From da0e2bf3173569da476f92bf756862484581ac7f Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 09:51:49 +0800
Subject: [PATCH 1/7] [Feature] Enable inference support for Deepseekr1-w8a8-MTP

Signed-off-by: l30074184
---
 vllm_ascend/models/deepseek_mtp.py       |  23 ++++-
 vllm_ascend/models/deepseek_v2.py        |   4 +-
 vllm_ascend/quantization/quant_config.py | 112 ++++++++++++++---------
 3 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py
index 979a6099f1..400c7a0acf 100644
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -28,8 +28,8 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import \
-    VocabParallelEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
     DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
     SharedHead)
@@ -40,6 +40,20 @@
 from .deepseek_v2 import CustomDeepseekV2DecoderLayer
 
 
+class CustomDeepSeekShareHead(SharedHead):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
+        nn.Module.__init__(self)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.head = ParallelLMHead(config.vocab_size,
+                                   config.hidden_size,
+                                   quant_config=quant_config,
+                                   prefix=maybe_prefix(prefix, "head"))
+
+
 class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
 
     def __init__(
@@ -61,7 +75,10 @@ def __init__(
         self.eh_proj = nn.Linear(config.hidden_size * 2,
                                  config.hidden_size,
                                  bias=False)
-        self.shared_head = SharedHead(config=config, quant_config=quant_config)
+        self.shared_head = CustomDeepSeekShareHead(config=config,
+                                                   quant_config=quant_config,
+                                                   prefix=maybe_prefix(
+                                                       prefix, "shared_head"))
         self.mtp_block = CustomDeepseekV2DecoderLayer(config, prefix,
                                                       model_config,
                                                       cache_config,
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
index d7f68a12c7..fa542e0afe 100644
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -861,7 +861,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(config.vocab_size,
                                           config.hidden_size,
-                                          quant_config=quant_config)
+                                          quant_config=quant_config,
+                                          prefix=maybe_prefix(
+                                              prefix, "lm_head"))
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 7c7ee58033..ada75ed80b 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -34,6 +34,8 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
 from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 
@@ -46,7 +48,7 @@
 @register_quantization_config(ASCEND_QUATIZATION_METHOD)
 class AscendQuantConfig(QuantizationConfig):
     """Config class for Ascend
-
+
     This class is a general class that parse quantization configs
     that are supported on ascend hardware.
     """
@@ -95,8 +97,8 @@ def get_quant_method(self, layer: torch.nn.Module,
             return AscendLinearMethod(self, prefix,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
-            'fa_quant_type' in self.quant_description.keys() and \
-            self.quant_description['fa_quant_type'] is not None:
+                'fa_quant_type' in self.quant_description.keys() and \
+                self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':
@@ -107,12 +109,18 @@ def get_quant_method(self, layer: torch.nn.Module,
                 return AscendUnquantizedFusedMoEMethod()
             return AscendFusedMoEMethod(self, prefix,
                                         self.packed_modules_mapping)
+        elif isinstance(layer, VocabParallelEmbedding):
+            if self.is_layer_skipped_ascend(prefix,
+                                            self.packed_modules_mapping):
+                return UnquantizedEmbeddingMethod()
+            return AscendEmbeddingMethod(self, prefix,
+                                         self.packed_modules_mapping)
         return None
 
     def is_layer_skipped_ascend(
-        self,
-        prefix: str,
-        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
+            self,
+            prefix: str,
+            fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
         # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
         proj_name = prefix.split(".")[-1]
         if proj_name in fused_mapping:
@@ -160,14 +168,14 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_linear_method()
 
     def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
+            self,
+            layer: torch.nn.Module,
+            input_size_per_partition: int,
+            output_partition_sizes: List[int],
+            input_size: int,
+            output_size: int,
+            params_dtype: torch.dtype,
+            **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
@@ -202,10 +210,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             self.quant_method.process_weights_after_loading(layer)
 
     def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+            self,
+            layer: torch.nn.Module,
+            x: torch.Tensor,
+            bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
@@ -262,13 +270,13 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_moe_method()
 
     def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
+            self,
+            layer: torch.nn.Module,
+            num_experts: int,
+            hidden_size: int,
+            intermediate_size_per_partition: int,
+            params_dtype: torch.dtype,
+            **extra_weight_attrs,
     ) -> None:
         weight_param = self.quant_method.get_weight(
             num_experts, intermediate_size_per_partition, hidden_size,
@@ -289,25 +297,25 @@ def create_weights(
         set_weight_attrs(param, extra_weight_attrs)
 
     def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        is_prefill: bool = True,
-        enable_force_load_balance: bool = False,
-        log2phy: torch.Tensor = None,
-        global_redundant_expert_num=0,
-        **kwargs,
+            self,
+            layer: torch.nn.Module,
+            x: torch.Tensor,
+            router_logits: torch.Tensor,
+            top_k: int,
+            renormalize: bool,
+            use_grouped_topk: bool = False,
+            global_num_experts: int = -1,
+            expert_map: Optional[torch.Tensor] = None,
+            topk_group: Optional[int] = None,
+            num_expert_group: Optional[int] = None,
+            custom_routing_function: Optional[Callable] = None,
+            scoring_func: str = "softmax",
+            e_score_correction_bias: Optional[torch.Tensor] = None,
+            is_prefill: bool = True,
+            enable_force_load_balance: bool = False,
+            log2phy: torch.Tensor = None,
+            global_redundant_expert_num=0,
+            **kwargs,
     ) -> torch.Tensor:
         return self.quant_method.apply(
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,
@@ -319,3 +327,19 @@ def apply(
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
+
+class AscendEmbeddingMethod(AscendLinearMethod):
+    """Embedding method for Ascend quantization.
+
+    This class calls AscendQuantizer to search a specific quantization
+    implementations supported on ascend hardware for Embedding methods.
+
+    Args:
+        quant_config: The Ascend quantization config.
+    """
+
+    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
+                 packed_modules_mapping: Dict[str, Any]) -> None:
+        self.quantizer = AscendQuantizer.get_quantizer(
+            quant_config.quant_description, prefix, packed_modules_mapping)
+        self.quant_method = self.quantizer.build_linear_method()
\ No newline at end of file

From b44f0c2a40ee91b363e3bacce5f6adda43e0db51 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 09:57:12 +0800
Subject: [PATCH 2/7] codecheck fixed

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 86 ++++++++++++------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ada75ed80b..ee76d2f522 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -97,8 +97,8 @@ def get_quant_method(self, layer: torch.nn.Module,
             return AscendLinearMethod(self, prefix,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
-                'fa_quant_type' in self.quant_description.keys() and \
-                self.quant_description['fa_quant_type'] is not None:
+            'fa_quant_type' in self.quant_description.keys() and \
+            self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':
@@ -118,9 +118,9 @@ def get_quant_method(self, layer: torch.nn.Module,
         return None
 
     def is_layer_skipped_ascend(
-            self,
-            prefix: str,
-            fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
+        self,
+        prefix: str,
+        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
         # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
         proj_name = prefix.split(".")[-1]
         if proj_name in fused_mapping:
@@ -168,14 +168,14 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_linear_method()
 
     def create_weights(
-            self,
-            layer: torch.nn.Module,
-            input_size_per_partition: int,
-            output_partition_sizes: List[int],
-            input_size: int,
-            output_size: int,
-            params_dtype: torch.dtype,
-            **extra_weight_attrs,
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
@@ -210,10 +210,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.quant_method.process_weights_after_loading(layer)
 
     def apply(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None,
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
@@ -270,13 +270,13 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_moe_method()
 
     def create_weights(
-            self,
-            layer: torch.nn.Module,
-            num_experts: int,
-            hidden_size: int,
-            intermediate_size_per_partition: int,
-            params_dtype: torch.dtype,
-            **extra_weight_attrs,
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
     ) -> None:
         weight_param = self.quant_method.get_weight(
             num_experts, intermediate_size_per_partition, hidden_size,
@@ -297,25 +297,25 @@ def create_weights(
         set_weight_attrs(param, extra_weight_attrs)
 
     def apply(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            router_logits: torch.Tensor,
-            top_k: int,
-            renormalize: bool,
-            use_grouped_topk: bool = False,
-            global_num_experts: int = -1,
-            expert_map: Optional[torch.Tensor] = None,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            custom_routing_function: Optional[Callable] = None,
-            scoring_func: str = "softmax",
-            e_score_correction_bias: Optional[torch.Tensor] = None,
-            is_prefill: bool = True,
-            enable_force_load_balance: bool = False,
-            log2phy: torch.Tensor = None,
-            global_redundant_expert_num=0,
-            **kwargs,
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        is_prefill: bool = True,
+        enable_force_load_balance: bool = False,
+        log2phy: torch.Tensor = None,
+        global_redundant_expert_num=0,
+        **kwargs,
     ) -> torch.Tensor:
         return self.quant_method.apply(
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,

From 2b40113e8ef7dfa7971ddb9102fdb7930d42ea3a Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:00:21 +0800
Subject: [PATCH 3/7] codecheck fixed2

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ee76d2f522..ce1841463f 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -98,7 +98,7 @@ def get_quant_method(self, layer: torch.nn.Module,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
             'fa_quant_type' in self.quant_description.keys() and \
-            self.quant_description['fa_quant_type'] is not None:
+                self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':

From 2cf885e87a247e78d58589ef530c0e52a102d1c1 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:27:40 +0800
Subject: [PATCH 4/7] codecheck fixed2

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ce1841463f..5de2a8886b 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -342,4 +342,5 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
         self.quantizer = AscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
\ No newline at end of file
+        self.quant_method = self.quantizer.build_linear_method()
+
\ No newline at end of file

From 185fc5f3d64d869feef63354afc2c5f51c9f5773 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:56:49 +0800
Subject: [PATCH 5/7] codecheck fixed3

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 5de2a8886b..f61d8306a4 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -343,4 +343,3 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quantizer = AscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
         self.quant_method = self.quantizer.build_linear_method()
-
\ No newline at end of file

From 31008d45fa61d32a754299e3906418e3d0040d30 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 14:37:08 +0800
Subject: [PATCH 6/7] codecheck fixed4

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index f61d8306a4..0329d278fe 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -328,6 +328,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
 
+
 class AscendEmbeddingMethod(AscendLinearMethod):
     """Embedding method for Ascend quantization.
 

From a67bc730ccec421b8263dd47a409989284d88c75 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Fri, 18 Jul 2025 11:15:49 +0800
Subject: [PATCH 7/7] trigger CI rerun

Signed-off-by: l30074184