Commit 60aa80e

revert masking vocab_size (#4089)
* revert turbomind masking vocab size
* revert mask-vocab-size in pytorch engine
* fix typo
* update log
1 parent 832ac83 commit 60aa80e

8 files changed: +7 additions, -32 deletions

lmdeploy/pytorch/engine/guided_process.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 logger = logging.getLogger('lmdeploy')


-class GuidedDecodingMangager:
+class GuidedDecodingManager:
     processors = {}

     def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]):

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 2 additions & 7 deletions
@@ -8,7 +8,7 @@
 from lmdeploy.messages import LogitsProcessor

 from ..messages import SchedulerSequence
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager


 def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor):
@@ -143,12 +143,10 @@ class FusedLogitsProcessor:
     def __init__(
         self,
         sampling_inputs: SamplingInputs,
-        sampling_vocab_size: Optional[int] = None,
         logprobs_mode: Optional[str] = None,
-        guided_decoding_manager: Optional[GuidedDecodingMangager] = None,
+        guided_decoding_manager: Optional[GuidedDecodingManager] = None,
     ):
         self.sampling_inputs: SamplingInputs = sampling_inputs
-        self.sampling_vocab_size = sampling_vocab_size
         self.logprobs_mode = logprobs_mode
         self.guided_decoding_manager = guided_decoding_manager
         if sampling_inputs.session_to_cleanup:
@@ -266,9 +264,6 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
             offsets = sampling_inputs.random_offsets
             return _multinomial_sampling(softmax_scores, seeds, offsets, indices)

-        if self.sampling_vocab_size is not None and logits.size(1) > self.sampling_vocab_size:
-            logits = logits[..., :self.sampling_vocab_size]
-
         if sampling_inputs.max_top_k == 1:
             result = logits.argmax(-1)
         else:
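
For context, the hunk above is the PyTorch-engine half of the revert: it deletes the step that truncated logits to the tokenizer's vocabulary just before sampling. A minimal standalone sketch of the reverted behaviour, with a hypothetical helper name and toy sizes (neither is from the codebase):

    from typing import Optional

    import torch

    def mask_to_tokenizer_vocab(logits: torch.Tensor, sampling_vocab_size: Optional[int]) -> torch.Tensor:
        # Some models (the config.py comment below cites Qwen2.5) pad lm_head
        # beyond the tokenizer's vocabulary; the reverted code sliced those
        # extra columns off so padded token ids could never be sampled.
        if sampling_vocab_size is not None and logits.size(1) > sampling_vocab_size:
            logits = logits[..., :sampling_vocab_size]
        return logits

    logits = torch.randn(2, 160)                       # padded model vocab (toy size)
    print(mask_to_tokenizer_vocab(logits, 150).shape)  # torch.Size([2, 150])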

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 3 additions & 8 deletions
@@ -30,7 +30,7 @@
 from ..utils import get_gpu_memory
 from ..weight_loader.model_weight_loader import load_model_weights
 from .cache_engine import CacheEngine
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager
 from .logits_process import FusedLogitsProcessor, SamplingInputs

 logger = get_logger('lmdeploy')
@@ -315,10 +315,6 @@ def __init__(self,
         self.cache_config = cache_config
         # use raw tokenizer
         self.tokenizer = Tokenizer(model_path).model.model
-        try:
-            self.sampling_vocab_size = len(self.tokenizer)
-        except BaseException:
-            self.sampling_vocab_size = None

         self._pre_in_que = None
         self._in_que = None
@@ -354,9 +350,9 @@ def __init__(self,
         self.cache_engine = None
         self.profiler: AgentProfiler = None
         try:
-            self.guided_decoding_manager = GuidedDecodingMangager(self.tokenizer, self.sampling_vocab_size)
+            self.guided_decoding_manager = GuidedDecodingManager(self.tokenizer, model_config.vocab_size)
         except ValueError as e:
-            logger.warning(f'Failed to create GuidedManager for tokenizer {self.tokenizer}: {e}')
+            logger.warning(f'Failed to create GuidedManager for tokenizer {type(self.tokenizer)}: {e}')
             self.guided_decoding_manager = None

         # microbatch
@@ -552,7 +548,6 @@ async def async_sampling_logits(self, logits: torch.Tensor, sampling_inputs: Sam
         with record_function('sampling_logits'):
             logits_processor = FusedLogitsProcessor(
                 sampling_inputs,
-                sampling_vocab_size=self.sampling_vocab_size,
                 logprobs_mode=self.misc_config.logprobs_mode,
                 guided_decoding_manager=self.guided_decoding_manager,
             )
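
The "update log" bullet in the commit message refers to the warning above: interpolating the tokenizer object itself can pull a very long repr into the log line, while type(...) keeps it to the class name. A hypothetical illustration (FakeTokenizer is a stand-in, not an lmdeploy class):

    class FakeTokenizer:
        def __repr__(self):
            # Real tokenizer reprs can enumerate added tokens and settings.
            return 'Tokenizer(vocab={... thousands of entries ...})'

    tok = FakeTokenizer()
    err = ValueError('unsupported tokenizer')
    print(f'Failed to create GuidedManager for tokenizer {tok}: {err}')        # old: full repr
    print(f'Failed to create GuidedManager for tokenizer {type(tok)}: {err}')  # new: class name only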

lmdeploy/turbomind/deploy/config.py

Lines changed: 0 additions & 3 deletions
@@ -54,9 +54,6 @@ class ModelConfig:
     # Therefore, we add a new attr "embedding_size" to represent the vocab dim
     # of token_embedding
     embedding_size: int = 0
-    # for some models like qwen2.5, the vocab size of the model is larger than
-    # the vocab size of the tokenizer.
-    tokenizer_size: int = None
     num_layer: int = None
     inter_size: List[int] = None
     norm_eps: float = None

lmdeploy/turbomind/deploy/target_model/base.py

Lines changed: 0 additions & 4 deletions
@@ -101,10 +101,6 @@ def update_model_config(self):
         final_cfg.update(self.input_model_info)
         if 'embedding_size' not in self.input_model_info.keys():
             final_cfg.update(embedding_size=self.input_model_info['vocab_size'])
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.input_model.tokenizer_path, trust_remote_code=True)
-        tokenizer_size = min(len(tokenizer), final_cfg['vocab_size'])
-        final_cfg.update(tokenizer_size=tokenizer_size)

         self.model_config = config_from_dict(ModelConfig, final_cfg)
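
The lines deleted above are where tokenizer_size was originally derived: load the tokenizer and clamp to the exported vocab_size. A hedged sketch of that comparison, assuming transformers is installed and using an example checkpoint id that is not taken from the diff:

    from transformers import AutoConfig, AutoTokenizer

    # Any model whose lm_head is padded past the tokenizer shows a gap here;
    # the config.py comment above names Qwen2.5 as one such family.
    model_id = 'Qwen/Qwen2.5-7B-Instruct'  # example id, not from the commit
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    # The removed code kept the smaller of the two as `tokenizer_size`.
    print(len(tokenizer), config.vocab_size, min(len(tokenizer), config.vocab_size))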

src/turbomind/models/llama/LlamaV2.cc

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ LlamaV2::LlamaV2(DataType dtype,

     // using float to avoid data overflow
     dynamic_decode_ = std::make_unique<DynamicDecodeLayer>(
-        kFloat32, max_batch_size, model.tokenizer_size, vocab_size_padded_, stream_, &ctx.device_prop);
+        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop);
 }

 void LlamaV2::updateEmbedding(char* decoder_input,

src/turbomind/models/llama/llama_params.h

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ struct ModelParam {
     size_t layer_num;
     size_t vocab_size;
     size_t embedding_size;
-    size_t tokenizer_size;
     float norm_eps;
     int quant_policy;
     bool attn_bias;

src/turbomind/triton_backend/llama/LlamaTritonModel.cc

Lines changed: 0 additions & 7 deletions
@@ -200,12 +200,6 @@ void LlamaTritonModel::handleMissingParams()
                        (int)model_param_.vocab_size);
     }

-    if (model_param_.tokenizer_size == 0) {
-        model_param_.tokenizer_size = model_param_.vocab_size;
-        TM_LOG_WARNING("[LlamaTritonModel] `tokenizer_size` is not set, default to `vocab_size` (%d).",
-                       (int)model_param_.vocab_size);
-    }
-
     if (!attn_param_.max_position_embeddings) {
         attn_param_.max_position_embeddings = 2048;
         TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.",
@@ -322,7 +316,6 @@ LlamaTritonModel::LlamaTritonModel(std::string model_
     model_param_.layer_num = model_reader["num_layer"].as<int>();
     model_param_.vocab_size = model_reader["vocab_size"].as<int>();
     model_param_.embedding_size = model_reader["embedding_size"].as<int>();
-    model_param_.tokenizer_size = model_reader["tokenizer_size"].as<int>(0);
     model_param_.norm_eps = model_reader["norm_eps"].as<float>();
     model_param_.tune_layer_num = model_reader["tune_layer_num"].as<int>(1);
     model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as<int>();
