Commit 60aa80e

revert masking vocab_size (#4089)
* revert turbomind masking vocab size
* revert mask-vocab-size in pytorch engine
* fix typo
* update log
1 parent 832ac83 commit 60aa80e

8 files changed: +7 additions, -32 deletions

lmdeploy/pytorch/engine/guided_process.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 logger = logging.getLogger('lmdeploy')


-class GuidedDecodingMangager:
+class GuidedDecodingManager:
     processors = {}

     def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]):

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 2 additions & 7 deletions
@@ -8,7 +8,7 @@
 from lmdeploy.messages import LogitsProcessor

 from ..messages import SchedulerSequence
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager


 def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor):
@@ -143,12 +143,10 @@ class FusedLogitsProcessor:
     def __init__(
         self,
         sampling_inputs: SamplingInputs,
-        sampling_vocab_size: Optional[int] = None,
         logprobs_mode: Optional[str] = None,
-        guided_decoding_manager: Optional[GuidedDecodingMangager] = None,
+        guided_decoding_manager: Optional[GuidedDecodingManager] = None,
     ):
         self.sampling_inputs: SamplingInputs = sampling_inputs
-        self.sampling_vocab_size = sampling_vocab_size
         self.logprobs_mode = logprobs_mode
         self.guided_decoding_manager = guided_decoding_manager
         if sampling_inputs.session_to_cleanup:
@@ -266,9 +264,6 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
             offsets = sampling_inputs.random_offsets
             return _multinomial_sampling(softmax_scores, seeds, offsets, indices)

-        if self.sampling_vocab_size is not None and logits.size(1) > self.sampling_vocab_size:
-            logits = logits[..., :self.sampling_vocab_size]
-
         if sampling_inputs.max_top_k == 1:
             result = logits.argmax(-1)
         else:
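
For context, the hunk above is the PyTorch-engine half of the revert: it deletes the step that truncated logits to the tokenizer's vocabulary just before sampling. A minimal standalone sketch of the reverted behaviour, with a hypothetical helper name and toy sizes (neither is from the codebase):

    from typing import Optional

    import torch

    def mask_to_tokenizer_vocab(logits: torch.Tensor, sampling_vocab_size: Optional[int]) -> torch.Tensor:
        # Some models (the config.py comment below cites Qwen2.5) pad lm_head
        # beyond the tokenizer's vocabulary; the reverted code sliced those
        # extra columns off so padded token ids could never be sampled.
        if sampling_vocab_size is not None and logits.size(1) > sampling_vocab_size:
            logits = logits[..., :sampling_vocab_size]
        return logits

    logits = torch.randn(2, 160)                       # padded model vocab (toy size)
    print(mask_to_tokenizer_vocab(logits, 150).shape)  # torch.Size([2, 150])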

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 3 additions & 8 deletions
@@ -30,7 +30,7 @@
 from ..utils import get_gpu_memory
 from ..weight_loader.model_weight_loader import load_model_weights
 from .cache_engine import CacheEngine
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager
 from .logits_process import FusedLogitsProcessor, SamplingInputs

 logger = get_logger('lmdeploy')
@@ -315,10 +315,6 @@ def __init__(self,
         self.cache_config = cache_config
         # use raw tokenizer
         self.tokenizer = Tokenizer(model_path).model.model
-        try:
-            self.sampling_vocab_size = len(self.tokenizer)
-        except BaseException:
-            self.sampling_vocab_size = None

         self._pre_in_que = None
         self._in_que = None
@@ -354,9 +350,9 @@ def __init__(self,
         self.cache_engine = None
         self.profiler: AgentProfiler = None
         try:
-            self.guided_decoding_manager = GuidedDecodingMangager(self.tokenizer, self.sampling_vocab_size)
+            self.guided_decoding_manager = GuidedDecodingManager(self.tokenizer, model_config.vocab_size)
         except ValueError as e:
-            logger.warning(f'Failed to create GuidedManager for tokenizer {self.tokenizer}: {e}')
+            logger.warning(f'Failed to create GuidedManager for tokenizer {type(self.tokenizer)}: {e}')
             self.guided_decoding_manager = None

         # microbatch
@@ -552,7 +548,6 @@ async def async_sampling_logits(self, logits: torch.Tensor, sampling_inputs: Sam
         with record_function('sampling_logits'):
             logits_processor = FusedLogitsProcessor(
                 sampling_inputs,
-                sampling_vocab_size=self.sampling_vocab_size,
                 logprobs_mode=self.misc_config.logprobs_mode,
                 guided_decoding_manager=self.guided_decoding_manager,
             )
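
The "update log" bullet in the commit message refers to the warning above: interpolating the tokenizer object itself can pull a very long repr into the log line, while type(...) keeps it to the class name. A hypothetical illustration (FakeTokenizer is a stand-in, not an lmdeploy class):

    class FakeTokenizer:
        def __repr__(self):
            # Real tokenizer reprs can enumerate added tokens and settings.
            return 'Tokenizer(vocab={... thousands of entries ...})'

    tok = FakeTokenizer()
    err = ValueError('unsupported tokenizer')
    print(f'Failed to create GuidedManager for tokenizer {tok}: {err}')        # old: full repr
    print(f'Failed to create GuidedManager for tokenizer {type(tok)}: {err}')  # new: class name only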

lmdeploy/turbomind/deploy/config.py

Lines changed: 0 additions & 3 deletions
@@ -54,9 +54,6 @@ class ModelConfig:
     # Therefore, we add a new attr "embedding_size" to represent the vocab dim
     # of token_embedding
     embedding_size: int = 0
-    # for some models like qwen2.5, the vocab size of the model is larger than
-    # the vocab size of the tokenizer.
-    tokenizer_size: int = None
     num_layer: int = None
     inter_size: List[int] = None
     norm_eps: float = None

lmdeploy/turbomind/deploy/target_model/base.py

Lines changed: 0 additions & 4 deletions
@@ -101,10 +101,6 @@ def update_model_config(self):
         final_cfg.update(self.input_model_info)
         if 'embedding_size' not in self.input_model_info.keys():
             final_cfg.update(embedding_size=self.input_model_info['vocab_size'])
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.input_model.tokenizer_path, trust_remote_code=True)
-        tokenizer_size = min(len(tokenizer), final_cfg['vocab_size'])
-        final_cfg.update(tokenizer_size=tokenizer_size)

         self.model_config = config_from_dict(ModelConfig, final_cfg)
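
The lines deleted above are where tokenizer_size was originally derived: load the tokenizer and clamp to the exported vocab_size. A hedged sketch of that comparison, assuming transformers is installed and using an example checkpoint id that is not taken from the diff:

    from transformers import AutoConfig, AutoTokenizer

    # Any model whose lm_head is padded past the tokenizer shows a gap here;
    # the config.py comment above names Qwen2.5 as one such family.
    model_id = 'Qwen/Qwen2.5-7B-Instruct'  # example id, not from the commit
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    # The removed code kept the smaller of the two as `tokenizer_size`.
    print(len(tokenizer), config.vocab_size, min(len(tokenizer), config.vocab_size))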

src/turbomind/models/llama/LlamaV2.cc

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ LlamaV2::LlamaV2(DataType dtype,

     // using float to avoid data overflow
     dynamic_decode_ = std::make_unique<DynamicDecodeLayer>(
-        kFloat32, max_batch_size, model.tokenizer_size, vocab_size_padded_, stream_, &ctx.device_prop);
+        kFloat32, max_batch_size, vocab_size_, vocab_size_padded_, stream_, &ctx.device_prop);
 }

 void LlamaV2::updateEmbedding(char* decoder_input,

src/turbomind/models/llama/llama_params.h

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ struct ModelParam {
     size_t layer_num;
     size_t vocab_size;
     size_t embedding_size;
-    size_t tokenizer_size;
     float norm_eps;
     int quant_policy;
     bool attn_bias;

src/turbomind/triton_backend/llama/LlamaTritonModel.cc

Lines changed: 0 additions & 7 deletions
@@ -200,12 +200,6 @@ void LlamaTritonModel::handleMissingParams()
                        (int)model_param_.vocab_size);
     }

-    if (model_param_.tokenizer_size == 0) {
-        model_param_.tokenizer_size = model_param_.vocab_size;
-        TM_LOG_WARNING("[LlamaTritonModel] `tokenizer_size` is not set, default to `vocab_size` (%d).",
-                       (int)model_param_.vocab_size);
-    }
-
     if (!attn_param_.max_position_embeddings) {
         attn_param_.max_position_embeddings = 2048;
         TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.",
@@ -322,7 +316,6 @@ LlamaTritonModel::LlamaTritonModel(std::string model_
     model_param_.layer_num = model_reader["num_layer"].as<int>();
     model_param_.vocab_size = model_reader["vocab_size"].as<int>();
     model_param_.embedding_size = model_reader["embedding_size"].as<int>();
-    model_param_.tokenizer_size = model_reader["tokenizer_size"].as<int>(0);
     model_param_.norm_eps = model_reader["norm_eps"].as<float>();
     model_param_.tune_layer_num = model_reader["tune_layer_num"].as<int>(1);
     model_param_.mla.q_lora_rank = model_reader["q_lora_rank"].as<int>();
