From da0e2bf3173569da476f92bf756862484581ac7f Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 09:51:49 +0800
Subject: [PATCH 1/7] [Feature] Enable inference support for Deepseekr1-w8a8-MTP

Signed-off-by: l30074184
---
 vllm_ascend/models/deepseek_mtp.py       |  23 ++++-
 vllm_ascend/models/deepseek_v2.py        |   4 +-
 vllm_ascend/quantization/quant_config.py | 112 ++++++++++++++---------
 3 files changed, 91 insertions(+), 48 deletions(-)

diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py
index 979a6099f1..400c7a0acf 100644
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -28,8 +28,8 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import get_sampler
-from vllm.model_executor.layers.vocab_parallel_embedding import \
-    VocabParallelEmbedding
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.models.deepseek_mtp import (
     DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer,
     SharedHead)
@@ -40,6 +40,20 @@
 from .deepseek_v2 import CustomDeepseekV2DecoderLayer
 
 
+class CustomDeepSeekShareHead(SharedHead):
+
+    def __init__(self,
+                 config: PretrainedConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "") -> None:
+        nn.Module.__init__(self)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.head = ParallelLMHead(config.vocab_size,
+                                   config.hidden_size,
+                                   quant_config=quant_config,
+                                   prefix=maybe_prefix(prefix, "head"))
+
+
 class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
 
     def __init__(
@@ -61,7 +75,10 @@ def __init__(
         self.eh_proj = nn.Linear(config.hidden_size * 2,
                                  config.hidden_size,
                                  bias=False)
-        self.shared_head = SharedHead(config=config, quant_config=quant_config)
+        self.shared_head = CustomDeepSeekShareHead(config=config,
+                                                   quant_config=quant_config,
+                                                   prefix=maybe_prefix(
+                                                       prefix, "shared_head"))
         self.mtp_block = CustomDeepseekV2DecoderLayer(config, prefix,
                                                       model_config,
                                                       cache_config,
diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py
index d7f68a12c7..fa542e0afe 100644
--- a/vllm_ascend/models/deepseek_v2.py
+++ b/vllm_ascend/models/deepseek_v2.py
@@ -861,7 +861,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(config.vocab_size,
                                           config.hidden_size,
-                                          quant_config=quant_config)
+                                          quant_config=quant_config,
+                                          prefix=maybe_prefix(
+                                              prefix, "lm_head"))
         else:
             self.lm_head = PPMissingLayer()
         self.logits_processor = LogitsProcessor(config.vocab_size)
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 7c7ee58033..ada75ed80b 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -34,6 +34,8 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
 from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 
@@ -46,7 +48,7 @@
 @register_quantization_config(ASCEND_QUATIZATION_METHOD)
 class AscendQuantConfig(QuantizationConfig):
     """Config class for Ascend
-
+
     This class is a general class that parse quantization configs
     that are supported on ascend hardware.
     """
@@ -95,8 +97,8 @@ def get_quant_method(self, layer: torch.nn.Module,
             return AscendLinearMethod(self, prefix,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
-            'fa_quant_type' in self.quant_description.keys() and \
-            self.quant_description['fa_quant_type'] is not None:
+                'fa_quant_type' in self.quant_description.keys() and \
+                self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':
@@ -107,12 +109,18 @@ def get_quant_method(self, layer: torch.nn.Module,
                 return AscendUnquantizedFusedMoEMethod()
             return AscendFusedMoEMethod(self, prefix,
                                         self.packed_modules_mapping)
+        elif isinstance(layer, VocabParallelEmbedding):
+            if self.is_layer_skipped_ascend(prefix,
+                                            self.packed_modules_mapping):
+                return UnquantizedEmbeddingMethod()
+            return AscendEmbeddingMethod(self, prefix,
+                                         self.packed_modules_mapping)
         return None
 
     def is_layer_skipped_ascend(
-        self,
-        prefix: str,
-        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
+            self,
+            prefix: str,
+            fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
         # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
         proj_name = prefix.split(".")[-1]
         if proj_name in fused_mapping:
@@ -160,14 +168,14 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_linear_method()
 
     def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: List[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
+            self,
+            layer: torch.nn.Module,
+            input_size_per_partition: int,
+            output_partition_sizes: List[int],
+            input_size: int,
+            output_size: int,
+            params_dtype: torch.dtype,
+            **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
@@ -202,10 +210,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             self.quant_method.process_weights_after_loading(layer)
 
     def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+            self,
+            layer: torch.nn.Module,
+            x: torch.Tensor,
+            bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
@@ -262,13 +270,13 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_moe_method()
 
     def create_weights(
-        self,
-        layer: torch.nn.Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
+            self,
+            layer: torch.nn.Module,
+            num_experts: int,
+            hidden_size: int,
+            intermediate_size_per_partition: int,
+            params_dtype: torch.dtype,
+            **extra_weight_attrs,
     ) -> None:
         weight_param = self.quant_method.get_weight(
             num_experts, intermediate_size_per_partition, hidden_size,
@@ -289,25 +297,25 @@ def create_weights(
         set_weight_attrs(param, extra_weight_attrs)
 
     def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        is_prefill: bool = True,
-        enable_force_load_balance: bool = False,
-        log2phy: torch.Tensor = None,
-        global_redundant_expert_num=0,
-        **kwargs,
+            self,
+            layer: torch.nn.Module,
+            x: torch.Tensor,
+            router_logits: torch.Tensor,
+            top_k: int,
+            renormalize: bool,
+            use_grouped_topk: bool = False,
+            global_num_experts: int = -1,
+            expert_map: Optional[torch.Tensor] = None,
+            topk_group: Optional[int] = None,
+            num_expert_group: Optional[int] = None,
+            custom_routing_function: Optional[Callable] = None,
+            scoring_func: str = "softmax",
+            e_score_correction_bias: Optional[torch.Tensor] = None,
+            is_prefill: bool = True,
+            enable_force_load_balance: bool = False,
+            log2phy: torch.Tensor = None,
+            global_redundant_expert_num=0,
+            **kwargs,
     ) -> torch.Tensor:
         return self.quant_method.apply(
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,
@@ -319,3 +327,19 @@ def apply(
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
+
+class AscendEmbeddingMethod(AscendLinearMethod):
+    """Embedding method for Ascend quantization.
+
+    This class calls AscendQuantizer to search a specific quantization
+    implementations supported on ascend hardware for Embedding methods.
+
+    Args:
+        quant_config: The Ascend quantization config.
+    """
+
+    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
+                 packed_modules_mapping: Dict[str, Any]) -> None:
+        self.quantizer = AscendQuantizer.get_quantizer(
+            quant_config.quant_description, prefix, packed_modules_mapping)
+        self.quant_method = self.quantizer.build_linear_method()
\ No newline at end of file

From b44f0c2a40ee91b363e3bacce5f6adda43e0db51 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 09:57:12 +0800
Subject: [PATCH 2/7] codecheck fixed

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 86 ++++++++++++------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ada75ed80b..ee76d2f522 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -97,8 +97,8 @@ def get_quant_method(self, layer: torch.nn.Module,
             return AscendLinearMethod(self, prefix,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
-                'fa_quant_type' in self.quant_description.keys() and \
-                self.quant_description['fa_quant_type'] is not None:
+            'fa_quant_type' in self.quant_description.keys() and \
+            self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':
@@ -118,9 +118,9 @@ def get_quant_method(self, layer: torch.nn.Module,
         return None
 
     def is_layer_skipped_ascend(
-            self,
-            prefix: str,
-            fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
+        self,
+        prefix: str,
+        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
         # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
         proj_name = prefix.split(".")[-1]
         if proj_name in fused_mapping:
@@ -168,14 +168,14 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_linear_method()
 
     def create_weights(
-            self,
-            layer: torch.nn.Module,
-            input_size_per_partition: int,
-            output_partition_sizes: List[int],
-            input_size: int,
-            output_size: int,
-            params_dtype: torch.dtype,
-            **extra_weight_attrs,
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
     ) -> None:
         output_size_per_partition = sum(output_partition_sizes)
         weight_loader = extra_weight_attrs.get("weight_loader")
@@ -210,10 +210,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         self.quant_method.process_weights_after_loading(layer)
 
     def apply(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            bias: Optional[torch.Tensor] = None,
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if isinstance(layer, RowParallelLinear):
             tp_rank = get_tensor_model_parallel_rank()
@@ -270,13 +270,13 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quant_method = self.quantizer.build_moe_method()
 
     def create_weights(
-            self,
-            layer: torch.nn.Module,
-            num_experts: int,
-            hidden_size: int,
-            intermediate_size_per_partition: int,
-            params_dtype: torch.dtype,
-            **extra_weight_attrs,
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
     ) -> None:
         weight_param = self.quant_method.get_weight(
             num_experts, intermediate_size_per_partition, hidden_size,
@@ -297,25 +297,25 @@ def create_weights(
         set_weight_attrs(param, extra_weight_attrs)
 
     def apply(
-            self,
-            layer: torch.nn.Module,
-            x: torch.Tensor,
-            router_logits: torch.Tensor,
-            top_k: int,
-            renormalize: bool,
-            use_grouped_topk: bool = False,
-            global_num_experts: int = -1,
-            expert_map: Optional[torch.Tensor] = None,
-            topk_group: Optional[int] = None,
-            num_expert_group: Optional[int] = None,
-            custom_routing_function: Optional[Callable] = None,
-            scoring_func: str = "softmax",
-            e_score_correction_bias: Optional[torch.Tensor] = None,
-            is_prefill: bool = True,
-            enable_force_load_balance: bool = False,
-            log2phy: torch.Tensor = None,
-            global_redundant_expert_num=0,
-            **kwargs,
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        is_prefill: bool = True,
+        enable_force_load_balance: bool = False,
+        log2phy: torch.Tensor = None,
+        global_redundant_expert_num=0,
+        **kwargs,
     ) -> torch.Tensor:
         return self.quant_method.apply(
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,

From 2b40113e8ef7dfa7971ddb9102fdb7930d42ea3a Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:00:21 +0800
Subject: [PATCH 3/7] codecheck fixed2

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ee76d2f522..ce1841463f 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -98,7 +98,7 @@ def get_quant_method(self, layer: torch.nn.Module,
                                       self.packed_modules_mapping)
         elif isinstance(layer, Attention) and \
             'fa_quant_type' in self.quant_description.keys() and \
-            self.quant_description['fa_quant_type'] is not None:
+                self.quant_description['fa_quant_type'] is not None:
             return AscendKVCacheMethod(self, prefix)
         elif isinstance(layer, Attention) and self.quant_description.get(
                 'kv_quant_type') == 'C8':

From 2cf885e87a247e78d58589ef530c0e52a102d1c1 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:27:40 +0800
Subject: [PATCH 4/7] codecheck fixed2

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index ce1841463f..5de2a8886b 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -342,4 +342,5 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                  packed_modules_mapping: Dict[str, Any]) -> None:
         self.quantizer = AscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
-        self.quant_method = self.quantizer.build_linear_method()
\ No newline at end of file
+        self.quant_method = self.quantizer.build_linear_method()
+
\ No newline at end of file

From 185fc5f3d64d869feef63354afc2c5f51c9f5773 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 10:56:49 +0800
Subject: [PATCH 5/7] codecheck fixed3

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 5de2a8886b..f61d8306a4 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -343,4 +343,3 @@ def __init__(self, quant_config: AscendQuantConfig, prefix: str,
         self.quantizer = AscendQuantizer.get_quantizer(
             quant_config.quant_description, prefix, packed_modules_mapping)
         self.quant_method = self.quantizer.build_linear_method()
-
\ No newline at end of file

From 31008d45fa61d32a754299e3906418e3d0040d30 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Thu, 17 Jul 2025 14:37:08 +0800
Subject: [PATCH 6/7] codecheck fixed4

Signed-off-by: l30074184
---
 vllm_ascend/quantization/quant_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index f61d8306a4..0329d278fe 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -328,6 +328,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):
             self.quant_method.process_weights_after_loading(layer)
 
+
 class AscendEmbeddingMethod(AscendLinearMethod):
     """Embedding method for Ascend quantization.
 

From a67bc730ccec421b8263dd47a409989284d88c75 Mon Sep 17 00:00:00 2001
From: l30074184
Date: Fri, 18 Jul 2025 11:15:49 +0800
Subject: [PATCH 7/7] trigger CI rerun

Signed-off-by: l30074184