diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8186da507a442..3e30b04d4f1f4 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -892,7 +892,9 @@ if (onnxruntime_USE_QNN OR onnxruntime_USE_QNN_INTERFACE)
       if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc" OR ${QNN_ARCH_ABI} STREQUAL "arm64x-windows-msvc")
         file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so"
                                                "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so"
-                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat")
+                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat"
+                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v81/unsigned/libQnnHtpV81Skel.so"
+                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v81/unsigned/libqnnhtpv81.cat")
         list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB})
       endif()
       message(STATUS "QNN lib files: " ${QNN_LIB_FILES})
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index e6bbebdbf3ab8..e772a63f8419c 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3937,6 +3937,7 @@ struct OrtApi {
    *      -# "69"
    *      -# "73"
    *      -# "75"
+   *      -# "81"
    *   "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device).
    *   "enable_htp_fp16_precision": Used for float32 model for HTP backend.
    *      Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 5bcb8ca394346..fa6e0697e9e39 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -32,6 +32,11 @@
 // Flag to determine if Backend should do node validation for each opNode added
 #define DO_GRAPH_NODE_VALIDATIONS 1
 
+// Ensure that we have a recent enough version of QNN
+static_assert(QNN_API_VERSION_MAJOR > 2 ||
+                  (QNN_API_VERSION_MAJOR == 2 && QNN_API_VERSION_MINOR >= 29),
+              "Minimum required QAIRT SDK version is 2.39.0");
+
 namespace onnxruntime {
 namespace qnn {
 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 2bdbfb9c1c62e..ec6243c3e4cb1 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -174,6 +174,8 @@ static void ParseHtpArchitecture(const std::string& htp_arch_string, QnnHtpDevic
     qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V73;
   } else if (htp_arch_string == "75") {
     qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V75;
+  } else if (htp_arch_string == "81") {
+    qnn_htp_arch = QNN_HTP_DEVICE_ARCH_V81;
   } else {
     LOGS_DEFAULT(WARNING) << "Invalid HTP architecture: " << htp_arch_string;
   }
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 463634b370d4c..9e69156efefa1 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -84,7 +84,7 @@ void usage() {
       "\t    '0', '1', '2', '3', default is '0'.\n"
       "\t    [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
       "\t    [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. \n"
-      "\t    Options are '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
+      "\t    Options are '0', '68', '69', '73', '75', '81'. Defaults to '0' (none). \n"
       "\t    [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
       "\t    [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
       "\t    Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
@@ -607,7 +607,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
             ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
           }
         } else if (key == "htp_arch") {
-          std::unordered_set<std::string> supported_htp_archs = {"0", "68", "69", "73", "75"};
+          std::unordered_set<std::string> supported_htp_archs = {"0", "68", "69", "73", "75", "81"};
           if (supported_htp_archs.find(value) == supported_htp_archs.end()) {
             std::ostringstream str_stream;
             std::copy(supported_htp_archs.begin(), supported_htp_archs.end(),
diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index 4383faab369ab..a4caea3dc3a75 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -447,8 +447,19 @@ TEST_F(QnnHTPBackendTests, Gemm_Static_B_And_Bias) {
                                         ExpectedEPNodeAssignment::All);
 }
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=31.434787750244141, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): 29.434776306152344
+// qdq@QNN_EP val: 28.229671478271484 (err: 1.2051048278808594, err/output_range: 3.8336660861968994%)
+// qdq@CPU_EP val: 29.092588424682617 (err: 0.34218788146972656, err/output_range: 1.0885642766952515%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 2.7451016902923584%
 // Test 8-bit QDQ Gemm with transposed A/B and static B and Bias inputs.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_Gemm_TransAB_Static_B_And_Bias_U8) {
+#else
 TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias_U8) {
+#endif
   std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
   std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
   std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
@@ -475,8 +486,19 @@ TEST_F(QnnHTPBackendTests, Gemm_TransAB_Static_B_And_Bias_U16Act_U8Weight) {
                                          true);  // Use com.microsoft Q/DQ ops
 }
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=31.434787750244141, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): 29.434776306152344
+// qdq@QNN_EP val: 28.229671478271484 (err: 1.2051048278808594, err/output_range: 3.8336660861968994%)
+// qdq@CPU_EP val: 29.092588424682617 (err: 0.34218788146972656, err/output_range: 1.0885642766952515%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 2.7451016902923584%
 // Test QDQ Gemm with transposed A/B and dynamic (i.e., not initializer) B and Bias inputs.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_Gemm_TransAB_Dynamic_B_And_Bias) {
+#else
 TEST_F(QnnHTPBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) {
+#endif
   std::vector<float> input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6);
   std::vector<float> input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24);
   std::vector<float> input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4);
diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc
index de12bfebe3f42..dc1967ef29482 100644
--- a/onnxruntime/test/providers/qnn/lrn_op_test.cc
+++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc
@@ -124,7 +124,18 @@ TEST_F(QnnCPUBackendTests, LRN_size_larger_than_channel) {
 // HTP tests:
 //
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 309
+// output_range=19.910608291625977, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -9.4876022338867188
+// qdq@QNN_EP val: -9.3696985244750977 (err: 0.11790370941162109, err/output_range: 0.59216529130935669%)
+// qdq@CPU_EP val: -9.5258598327636719 (err: 0.038257598876953125, err/output_range: 0.19214680790901184%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 0.40001851320266724%
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_LRNSize3) {
+#else
 TEST_F(QnnHTPBackendTests, LRNSize3) {
+#endif
   RunQDQLRNOpTest<uint8_t>(TestInputDef<float>({1, 128, 4, 5}, false, -10.0f, 10.0f),
                            3,  // Size
                            ExpectedEPNodeAssignment::All,
@@ -134,7 +145,18 @@ TEST_F(QnnHTPBackendTests, LRNSize3) {
                            13);      // opset
 }
 
+// Broken on v79 devices:
+// Inaccuracy detected for output 'output_0', element 185
+// output_range=19.911705017089844, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -5.3502998352050781
+// qdq@QNN_EP val: -5.2317028045654297 (err: 0.11859703063964844, err/output_range: 0.59561461210250854%)
+// qdq@CPU_EP val: -5.3878731727600098 (err: 0.037573337554931641, err/output_range: 0.18869975209236145%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 0.40691488981246948%
+#if defined(__aarch64__)
+TEST_F(QnnHTPBackendTests, DISABLED_LRNSize5) {
+#else
 TEST_F(QnnHTPBackendTests, LRNSize5) {
+#endif
   RunQDQLRNOpTest<uint8_t>(TestInputDef<float>({1, 128, 4, 5}, false, -10.0f, 10.0f),
                            5,  // Size
                            ExpectedEPNodeAssignment::All,
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp
index eb06643cfc119..143784039c3af 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cpp
+++ b/onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -250,7 +250,19 @@ TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp) {
   // RunMatMulOpTest({3, 3, 3}, {3, 2}, true, false, ExpectedEPNodeAssignment::All, "htp", 18, 1e-2f);
 }
 
+// Broken on v79 and v81 devices with several results outside of acceptable tolerance.
+// Example:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=0.010000000707805157, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): 0.010000000707805157
+// qdq@QNN_EP val: 0.0099215693771839142 (err: 7.8431330621242523e-05, err/output_range: 0.78431320190429688%)
+// qdq@CPU_EP val: 0.010000000707805157 (err: 0, err/output_range: 0%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 0.78431320190429688%
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_QDQ) {
+#else
 TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) {
+#endif
   // UINT8
   // RunQDQMatMulOpTest(shape_0, shape_1, is_initializer_0, is_initializer_1, expected_ep_assignment, opset,
   // use_contrib_qdq)
diff --git a/onnxruntime/test/providers/qnn/reduce_op_test.cc b/onnxruntime/test/providers/qnn/reduce_op_test.cc
index 0884a0fc80165..44a55ff0ba9dc 100644
--- a/onnxruntime/test/providers/qnn/reduce_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reduce_op_test.cc
@@ -430,12 +430,23 @@ static void RunReduceOpQDQTest(const std::string& op_type,
 // ReduceSum
 //
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=2.785210132598877, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -2.785210132598877
+// qdq@QNN_EP val: -2.6541414260864258 (err: 0.13106870651245117, err/output_range: 4.7058820724487305%)
+// qdq@CPU_EP val: -2.7415206432342529 (err: 0.043689489364624023, err/output_range: 1.5686246156692505%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 3.1372575759887695%
 // Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
 // - Uses opset 13, which has "axes" as an input.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_ReduceSumU8Opset13) {
+#else
 TEST_F(QnnHTPBackendTests, ReduceSumU8Opset13) {
+#endif
   RunReduceOpQDQTest<uint8_t>("ReduceSum",
                               TestInputDef<float>({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}),
                               {0, 1},  // axes
@@ -454,12 +465,23 @@ TEST_F(QnnHTPBackendTests, ReduceSumU8Opset13_LastAxis) {
                               13,    // opset
                               ExpectedEPNodeAssignment::All);
 }
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=2.785210132598877, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -2.785210132598877
+// qdq@QNN_EP val: -2.6541414260864258 (err: 0.13106870651245117, err/output_range: 4.7058820724487305%)
+// qdq@CPU_EP val: -2.7415206432342529 (err: 0.043689489364624023, err/output_range: 1.5686246156692505%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 3.1372575759887695%
 // Test creates a Q -> DQ -> ReduceSum -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
 // - Uses opset 11, which has "axes" as an attribute.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_ReduceSumU8Opset11) {
+#else
 TEST_F(QnnHTPBackendTests, ReduceSumU8Opset11) {
+#endif
   RunReduceOpQDQTest<uint8_t>("ReduceSum",
                               TestInputDef<float>({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}),
                               {0, 1},  // axes
@@ -628,12 +650,23 @@ TEST_F(QnnHTPBackendTests, ReduceMinS8Opset18) {
 // ReduceMean
 //
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=0.69630253314971924, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -0.69630253314971924
+// qdq@QNN_EP val: -0.66353535652160645 (err: 0.032767176628112793, err/output_range: 4.7058820724487305%)
+// qdq@CPU_EP val: -0.68538016080856323 (err: 0.010922372341156006, err/output_range: 1.5686246156692505%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 3.1372575759887695%
 // Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
 // - Uses opset 18, which has "axes" as an input.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_ReduceMeanU8Opset18) {
+#else
 TEST_F(QnnHTPBackendTests, ReduceMeanU8Opset18) {
+#endif
   RunReduceOpQDQTest<uint8_t>("ReduceMean",
                               TestInputDef<float>({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}),
                               {0, 1},  // axes
@@ -653,12 +686,23 @@ TEST_F(QnnHTPBackendTests, ReduceMeanU8Opset18_LastAxis) {
                               ExpectedEPNodeAssignment::All);
 }
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=0.69630253314971924, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -0.69630253314971924
+// qdq@QNN_EP val: -0.66353535652160645 (err: 0.032767176628112793, err/output_range: 4.7058820724487305%)
+// qdq@CPU_EP val: -0.68538016080856323 (err: 0.010922372341156006, err/output_range: 1.5686246156692505%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 3.1372575759887695%
 // Test creates a Q -> DQ -> ReduceMean -> Q -> DQ graph, and checks that all
 // nodes are supported by the QNN EP, and that the inference results match the CPU EP results.
 //
 // - Uses uint8 as the quantization type.
 // - Uses opset 13, which has "axes" as an attribute.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_ReduceMeanU8Opset13) {
+#else
 TEST_F(QnnHTPBackendTests, ReduceMeanU8Opset13) {
+#endif
   RunReduceOpQDQTest<uint8_t>("ReduceMean",
                               TestInputDef<float>({2, 2}, false, {-10.0f, 3.21289f, -5.9981f, 10.0f}),
                               {0, 1},  // axes
diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
index efa7769e272f4..a1f085500a148 100644
--- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc
@@ -710,8 +710,19 @@ TEST_F(QnnHTPBackendTests, UnaryOp_Abs_U16) {
                          true);        // Use com.microsoft domain for Q/DQ ops
 }
 
+// Broken on v79 and v81 devices:
+// Inaccuracy detected for output 'output_0', element 0
+// output_range=24, tolerance=0.40000000596046448%.
+// Expected val (f32@CPU_EP): -12
+// qdq@QNN_EP val: -11.011764526367188 (err: 0.9882354736328125, err/output_range: 4.1176481246948242%)
+// qdq@CPU_EP val: -12.047059059143066 (err: 0.047059059143066406, err/output_range: 0.19607941806316376%)
+// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 3.9215683937072754%
 // Test accuracy of QDQ Ceil op.
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Ceil) {
+#else
 TEST_F(QnnHTPBackendTests, UnaryOp_Ceil) {
+#endif
   const std::vector<float> input_data = GetFloatDataInRange(-12.0f, 12.0f, 6);
   RunQDQOpTest<uint8_t>("Ceil",
                         {TestInputDef<float>({1, 2, 3}, false, input_data)},
diff --git a/setup.py b/setup.py
index 6bfb53329f319..3347e68bd75a5 100644
--- a/setup.py
+++ b/setup.py
@@ -417,7 +417,7 @@ def finalize_options(self):
     libs.extend(["DirectML.dll"])
     # WebGPU/Dawn Libs
     libs.extend(["dxcompiler.dll", "dxil.dll"])
-    # QNN V68/V73 dependencies
+    # QNN V68/V73/V81 dependencies
     qnn_deps = [
         "QnnCpu.dll",
         "QnnGpu.dll",
@@ -425,6 +425,9 @@ def finalize_options(self):
         "QnnSaver.dll",
         "QnnSystem.dll",
         "QnnHtpPrepare.dll",
+        "QnnHtpV81Stub.dll",
+        "libQnnHtpV81Skel.so",
+        "libqnnhtpv81.cat",
         "QnnHtpV73Stub.dll",
         "libQnnHtpV73Skel.so",
         "libqnnhtpv73.cat",
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
index 72343613d6b26..35b839e4f1b7f 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows-qnn.yml
@@ -86,6 +86,7 @@ steps:
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
+          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
 
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
index 3c1150269bdb0..28a1960aac27b 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-artifacts-package-and-publish-steps-windows.yml
@@ -86,6 +86,7 @@ steps:
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpPrepare.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV68Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV73Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
+          copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnHtpV81Stub.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSaver.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
           copy $(Build.BinariesDirectory)\${{parameters.buildConfig}}\${{parameters.buildConfig}}\QnnSystem.dll $(Build.BinariesDirectory)\${{parameters.artifactName}}\lib
 
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index ead240a7cef1b..a29df2a0796a0 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -586,15 +586,28 @@ def generate_files(line_list, args):
             files_list.append(
                 "<file src=" + '"' + os.path.join(args.native_build_path, "QnnHtpPrepare.dll") + runtimes + " />"
             )
-            files_list.append(
-                "<file src=" + '"' + os.path.join(args.native_build_path, "QnnHtpV73Stub.dll") + runtimes + " />"
-            )
-            files_list.append(
-                "<file src=" + '"' + os.path.join(args.native_build_path, "libQnnHtpV73Skel.so") + runtimes + " />"
-            )
-            files_list.append(
-                "<file src=" + '"' + os.path.join(args.native_build_path, "libqnnhtpv73.cat") + runtimes + " />"
-            )
+            for htp_arch in [73, 81]:
+                files_list.append(
+                    "<file src="
+                    + '"'
+                    + os.path.join(args.native_build_path, f"QnnHtpV{htp_arch}Stub.dll")
+                    + runtimes
+                    + " />"
+                )
+                files_list.append(
+                    "<file src="
+                    + '"'
+                    + os.path.join(args.native_build_path, f"libQnnHtpV{htp_arch}Skel.so")
+                    + runtimes
+                    + " />"
+                )
+                files_list.append(
+                    "<file src="
+                    + '"'
+                    + os.path.join(args.native_build_path, f"libqnnhtpv{htp_arch}.cat")
+                    + runtimes
+                    + " />"
+                )
 
     is_ado_packaging_build = False
     # Process runtimes