From 7166f1b63793b825185958cf2527496f5a65cda0 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Fri, 18 Jul 2025 12:12:09 -0400 Subject: [PATCH 01/10] feat(tokenization): add encode_message to tokenize messages one by one --- src/transformers/tokenization_utils_base.py | 81 +++++++++++++++++++ tests/tokenization/test_tokenization_utils.py | 23 ++++++ 2 files changed, 104 insertions(+) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a8d0336b4663..996255edd729 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1694,6 +1694,87 @@ def apply_chat_template( else: return rendered_chat + def _encode_message( + self, + message: dict[str, str], + conversation_history: list[dict[str, str]], + add_generation_prompt: bool, + **kwargs, + ) -> list[int]: + """ + Helper function to encode a single message. + """ + conversation = conversation_history + [message] + tokens = self.apply_chat_template( + conversation, add_generation_prompt=add_generation_prompt, tokenize=True, **kwargs + ) + + # Now we need to find the beginning of the last message in the token stream. + # We do this by tokenizing the conversation *without* the last message and seeing where the token streams differ. + if len(conversation_history) > 0: + prefix_tokens = self.apply_chat_template( + conversation_history, add_generation_prompt=add_generation_prompt, tokenize=True, **kwargs + ) + # It's possible that the prefix tokens are not a prefix of the full list of tokens. + # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`. + # In this case, we can't simply find the prefix, so we have to do something a bit more subtle. + # We look for the first place where the tokens differ, and that's our split point. + # This is not perfect, but it's the best we can do without a token-level API. + # To make this more robust, we could do a diff and find the longest common subsequence, but this is + # a good first approximation. + # This is particularly important for models like Llama3 that have changed their chat template to include + # EOS tokens after user messages. + min_len = min(len(prefix_tokens), len(tokens)) + for i in range(min_len): + if prefix_tokens[i] != tokens[i]: + return tokens[i:] + return tokens[min_len:] + else: + return tokens + + def encode_message( + self, + message: dict[str, str], + conversation_history: Optional[list[dict[str, str]]] = None, + add_generation_prompt: bool = False, + **kwargs, + ) -> list[int]: + """ + Tokenize a single message. This method is a convenience wrapper around `apply_chat_template` that allows you + to tokenize messages one by one. This is useful for things like token-by-token streaming. + + This method is not guaranteed to be perfect. For some models, it may be impossible to robustly tokenize + single messages. For example, if the chat template adds tokens after each message, but also has a prefix that + is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token. + In these cases, this method will do its best to find the correct tokenization, but it may not be perfect. + + Args: + message (`dict`): + A dictionary with "role" and "content" keys, representing the message to tokenize. + conversation_history (`list[dict]`, *optional*): + A list of dicts with "role" and "content" keys, representing the chat history so far. 
If you are + tokenizing messages one by one, you should pass the previous messages in the conversation here. + add_generation_prompt (bool, *optional*): + If this is set, a prompt with the token(s) that indicate the start of an assistant message will be + appended to the formatted output. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + **kwargs: + Additional kwargs to pass to the `apply_chat_template` method. + + Returns: + `list[int]`: A list of token ids representing the tokenized message. + """ + if conversation_history is None: + conversation_history = [] + + return self._encode_message( + message=message, + conversation_history=conversation_history, + add_generation_prompt=add_generation_prompt, + **kwargs, + ) + def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[list[dict]] = None) -> str: """ Retrieve the chat template string used for tokenizing chat messages. This template is used diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index dd1aae486d13..58610a1cc2a3 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -24,6 +24,7 @@ import numpy as np from transformers import ( + AutoTokenizer, BatchEncoding, BertTokenizer, BertTokenizerFast, @@ -375,3 +376,25 @@ def test_training_new_tokenizer_edge_cases(self): tokenizer = PreTrainedTokenizerFast(tokenizer_object=_tokenizer) toy_text_iterator = ("a" for _ in range(1000)) tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50) + + +class ChatTemplateTest(unittest.TestCase): + def test_encode_message(self): + tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hey there, how are you?"}, + {"role": "assistant", "content": "Thank you for asking, I am doing well"}, + {"role": "user", "content": "What's the weather like today?"}, + {"role": "assistant", "content": "Today the weather is nice"}, + ] + + # First, test the default case, where we encode the whole conversation at once + whole_conversation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True) + + # Now, test the message-by-message encoding + tokens = [] + for i, message in enumerate(conversation): + tokens += tokenizer.encode_message(message, conversation_history=conversation[:i]) + + self.assertEqual(whole_conversation_tokens, tokens) From c0c65ba56b98cfca4e394180bba4582add374f7f Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Mon, 21 Jul 2025 12:39:03 -0400 Subject: [PATCH 02/10] Fix the `encode_message` method, remove the `add_generation_prompt` parameter and add the corresponding error handling. Update the document to reflect this change and verify the error handling in the test. 
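For reference, a minimal usage sketch of the API after this change (illustrative only: the checkpoint name is the one already used in the tests, and the method is renamed to `encode_message_with_chat_template` later in this series):

    from transformers import AutoTokenizer

    # Illustrative only: checkpoint taken from the tests in this series.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    history = [{"role": "system", "content": "You are a helpful assistant"}]
    message = {"role": "user", "content": "Hey there, how are you?"}

    # Tokenize a single message, passing the earlier turns as history.
    tokens = tokenizer.encode_message(message, conversation_history=history)

    # After this change, add_generation_prompt is rejected; the generation
    # prompt has to be added separately.
    try:
        tokenizer.encode_message(message, add_generation_prompt=True)
    except ValueError:
        pass  # expected
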
--- src/transformers/tokenization_utils_base.py | 20 ++++++++++--------- tests/tokenization/test_tokenization_utils.py | 9 +++++++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 996255edd729..d72a9def5955 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1736,7 +1736,6 @@ def encode_message( self, message: dict[str, str], conversation_history: Optional[list[dict[str, str]]] = None, - add_generation_prompt: bool = False, **kwargs, ) -> list[int]: """ @@ -1748,30 +1747,33 @@ def encode_message( is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token. In these cases, this method will do its best to find the correct tokenization, but it may not be perfect. + **Note:** This method does not support `add_generation_prompt`. If you want to add a generation prompt, + you should do it separately after tokenizing the conversation. + Args: message (`dict`): A dictionary with "role" and "content" keys, representing the message to tokenize. conversation_history (`list[dict]`, *optional*): A list of dicts with "role" and "content" keys, representing the chat history so far. If you are tokenizing messages one by one, you should pass the previous messages in the conversation here. - add_generation_prompt (bool, *optional*): - If this is set, a prompt with the token(s) that indicate the start of an assistant message will be - appended to the formatted output. This is useful when you want to generate a response from the model. - Note that this argument will be passed to the chat template, and so it must be supported in the - template for this argument to have any effect. **kwargs: Additional kwargs to pass to the `apply_chat_template` method. Returns: `list[int]`: A list of token ids representing the tokenized message. """ + if "add_generation_prompt" in kwargs: + raise ValueError( + "`encode_message` does not support `add_generation_prompt`. Please add the generation prompt " + "separately." + ) if conversation_history is None: conversation_history = [] - + return self._encode_message( message=message, conversation_history=conversation_history, - add_generation_prompt=add_generation_prompt, + add_generation_prompt=False, **kwargs, ) @@ -3333,7 +3335,7 @@ def pad( pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). padding_side (`str`, *optional*): The side on which the model should have padding applied. Should be selected between ['right', 'left']. 
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index 58610a1cc2a3..12a79cb4d825 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -398,3 +398,12 @@ def test_encode_message(self): tokens += tokenizer.encode_message(message, conversation_history=conversation[:i]) self.assertEqual(whole_conversation_tokens, tokens) + + def test_encode_message_raises_on_add_generation_prompt(self): + tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hey there, how are you?"}, + ] + with self.assertRaises(ValueError): + tokenizer.encode_message(conversation[0], add_generation_prompt=True) From 423287beab1fed89d60382aa006c884abd7be7ac Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Tue, 22 Jul 2025 14:25:57 -0400 Subject: [PATCH 03/10] Optimize the `encode_message` method, improve the processing logic of the empty dialogue history, and ensure that the chat template can be applied correctly when the dialogue history is empty. Update the document to reflect these changes. --- src/transformers/tokenization_utils_base.py | 35 ++++++++++++------- tests/tokenization/test_tokenization_utils.py | 2 -- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index d72a9def5955..8baf19c32dcc 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1741,15 +1741,12 @@ def encode_message( """ Tokenize a single message. This method is a convenience wrapper around `apply_chat_template` that allows you to tokenize messages one by one. This is useful for things like token-by-token streaming. - This method is not guaranteed to be perfect. For some models, it may be impossible to robustly tokenize single messages. For example, if the chat template adds tokens after each message, but also has a prefix that is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token. In these cases, this method will do its best to find the correct tokenization, but it may not be perfect. - **Note:** This method does not support `add_generation_prompt`. If you want to add a generation prompt, you should do it separately after tokenizing the conversation. - Args: message (`dict`): A dictionary with "role" and "content" keys, representing the message to tokenize. @@ -1758,7 +1755,6 @@ def encode_message( tokenizing messages one by one, you should pass the previous messages in the conversation here. **kwargs: Additional kwargs to pass to the `apply_chat_template` method. - Returns: `list[int]`: A list of token ids representing the tokenized message. """ @@ -1767,15 +1763,30 @@ def encode_message( "`encode_message` does not support `add_generation_prompt`. Please add the generation prompt " "separately." 
) - if conversation_history is None: - conversation_history = [] - - return self._encode_message( - message=message, - conversation_history=conversation_history, - add_generation_prompt=False, - **kwargs, + + if conversation_history is None or len(conversation_history) == 0: + return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs) + + conversation = conversation_history + [message] + tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs) + + prefix_tokens = self.apply_chat_template( + conversation_history, add_generation_prompt=False, tokenize=True, **kwargs ) + # It's possible that the prefix tokens are not a prefix of the full list of tokens. + # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`. + # In this case, we can't simply find the prefix, so we have to do something a bit more subtle. + # We look for the first place where the tokens differ, and that's our split point. + # This is not perfect, but it's the best we can do without a token-level API. + # To make this more robust, we could do a diff and find the longest common subsequence, but this is + # a good first approximation. + # This is particularly important for models like Llama3 that have changed their chat template to include + # EOS tokens after user messages. + min_len = min(len(prefix_tokens), len(tokens)) + for i in range(min_len): + if prefix_tokens[i] != tokens[i]: + return tokens[i:] + return tokens[min_len:] def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[list[dict]] = None) -> str: """ diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index 12a79cb4d825..8ba755fa4e91 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -377,8 +377,6 @@ def test_training_new_tokenizer_edge_cases(self): toy_text_iterator = ("a" for _ in range(1000)) tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50) - -class ChatTemplateTest(unittest.TestCase): def test_encode_message(self): tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") conversation = [ From 7ab08034153c006132a8d62380c6480d37de1ba1 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Tue, 22 Jul 2025 14:37:58 -0400 Subject: [PATCH 04/10] The `_encode_message` method is deleted, the message coding logic is simplified, and the functional integrity of the `encode_message` method is ensured. Update the document to reflect these changes. --- src/transformers/tokenization_utils_base.py | 38 --------------------- 1 file changed, 38 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 8baf19c32dcc..afa51dd12b6f 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1694,44 +1694,6 @@ def apply_chat_template( else: return rendered_chat - def _encode_message( - self, - message: dict[str, str], - conversation_history: list[dict[str, str]], - add_generation_prompt: bool, - **kwargs, - ) -> list[int]: - """ - Helper function to encode a single message. 
- """ - conversation = conversation_history + [message] - tokens = self.apply_chat_template( - conversation, add_generation_prompt=add_generation_prompt, tokenize=True, **kwargs - ) - - # Now we need to find the beginning of the last message in the token stream. - # We do this by tokenizing the conversation *without* the last message and seeing where the token streams differ. - if len(conversation_history) > 0: - prefix_tokens = self.apply_chat_template( - conversation_history, add_generation_prompt=add_generation_prompt, tokenize=True, **kwargs - ) - # It's possible that the prefix tokens are not a prefix of the full list of tokens. - # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`. - # In this case, we can't simply find the prefix, so we have to do something a bit more subtle. - # We look for the first place where the tokens differ, and that's our split point. - # This is not perfect, but it's the best we can do without a token-level API. - # To make this more robust, we could do a diff and find the longest common subsequence, but this is - # a good first approximation. - # This is particularly important for models like Llama3 that have changed their chat template to include - # EOS tokens after user messages. - min_len = min(len(prefix_tokens), len(tokens)) - for i in range(min_len): - if prefix_tokens[i] != tokens[i]: - return tokens[i:] - return tokens[min_len:] - else: - return tokens - def encode_message( self, message: dict[str, str], From b7e57b9ccb50d5cfa5afef3a4f9abd28e476a782 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Tue, 22 Jul 2025 14:57:11 -0400 Subject: [PATCH 05/10] Docs fix --- src/transformers/tokenization_utils_base.py | 46 ++++++--------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index afa51dd12b6f..4857d4432b71 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3267,63 +3267,43 @@ def pad( verbose: bool = True, ) -> BatchEncoding: """ - Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length - in the batch. - - Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, - `self.pad_token_id` and `self.pad_token_type_id`). - - Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the - text followed by a call to the `pad` method to get a padded encoding. - - - - If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the - result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of - PyTorch tensors, you will lose the specific device of your tensors however. - - + Pad a single encoded input or a batch of encoded inputs up to the maximum length of the batch or up to a + given maximum length. Padding side can be specified on the left or on the right. Args: - encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]` or `list[dict[str, list[int]]]`): - Tokenized inputs. 
Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of - tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str, - list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader - collate function. - - Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see - the note above for the return type. + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`): + Tokenized inputs. Can be a single batch encoding, a list of batch encodings, a dictionary of entries + produced by a `tokenizer.encode_plus` or a list of dictionaries from a `tokenizer.batch_encode_plus`. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). padding_side (`str`, *optional*): - The side on which the model should have padding applied. Should be selected between ['right', 'left']. - Default value is picked from the class attribute of the same name. + 'right' or 'left'. If get_ येणार from the tokenizer, the `tokenizer.padding_side` will be used. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according - to the specific tokenizer's default, defined by the `return_outputs` attribute. + to the specific tokenizer's default. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'tf'`: Return TensorFlow `tf.Tensor` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return Numpy `np.ndarray` objects. + - `'np'`: Return NumPy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. """ @@ -3761,7 +3741,7 @@ def _pad( - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. 
- This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). padding_side: The side on which the model should have padding applied. Should be selected between ['right', 'left']. From f39d67a104ed443fc448a39f552c8fbfcb02bfa8 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Wed, 23 Jul 2025 12:11:21 -0400 Subject: [PATCH 06/10] Revert changes in docstring of pad() --- src/transformers/tokenization_utils_base.py | 71 +++++++++++++++------ 1 file changed, 53 insertions(+), 18 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4857d4432b71..a53050b40047 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3724,30 +3724,65 @@ def _pad( return_attention_mask: Optional[bool] = None, ) -> dict: """ - Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, + `self.pad_token_id` and `self.pad_token_type_id`). + + Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the + text followed by a call to the `pad` method to get a padded encoding. + + + + If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of + PyTorch tensors, you will lose the specific device of your tensors however. + + Args: - encoded_inputs: - Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`). - max_length: maximum length of the returned list and optionally padding length (see below). - Will truncate by taking into account the special tokens. - padding_strategy: PaddingStrategy to use for padding. - - - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - - PaddingStrategy.DO_NOT_PAD: Do not pad - The tokenizer padding sides are defined in `padding_side` argument: - - - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences - pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]` or `list[dict[str, list[int]]]`): + Tokenized inputs. Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of + tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str, + list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader + collate function. + + Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see + the note above for the return type. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). - padding_side: + padding_side (`str`, *optional*): The side on which the model should have padding applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name. - return_attention_mask: - (optional) Set to False to avoid returning attention mask (default: set to model specifics) + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. """ # Load from model defaults if return_attention_mask is None: From f14a3ee3199770f9393be2388495a1fb33d146a2 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Wed, 23 Jul 2025 12:21:26 -0400 Subject: [PATCH 07/10] Revert changes in docstring --- src/transformers/tokenization_utils_base.py | 115 +++++++++----------- 1 file changed, 50 insertions(+), 65 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index a53050b40047..f717a0cf753e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3267,22 +3267,41 @@ def pad( verbose: bool = True, ) -> BatchEncoding: """ - Pad a single encoded input or a batch of encoded inputs up to the maximum length of the batch or up to a - given maximum length. Padding side can be specified on the left or on the right. + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, + `self.pad_token_id` and `self.pad_token_type_id`). + + Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the + text followed by a call to the `pad` method to get a padded encoding. 
+ + + + If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of + PyTorch tensors, you will lose the specific device of your tensors however. + + Args: - encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`): - Tokenized inputs. Can be a single batch encoding, a list of batch encodings, a dictionary of entries - produced by a `tokenizer.encode_plus` or a list of dictionaries from a `tokenizer.batch_encode_plus`. + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]` or `list[dict[str, list[int]]]`): + Tokenized inputs. Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of + tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str, + list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader + collate function. + + Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see + the note above for the return type. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths). max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). @@ -3292,18 +3311,19 @@ def pad( This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). padding_side (`str`, *optional*): - 'right' or 'left'. If get_ येणार from the tokenizer, the `tokenizer.padding_side` will be used. + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according - to the specific tokenizer's default. + to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask) return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + - `'np'`: Return Numpy `np.ndarray` objects. verbose (`bool`, *optional*, defaults to `True`): Whether or not to print more information and warnings. 
""" @@ -3724,65 +3744,30 @@ def _pad( return_attention_mask: Optional[bool] = None, ) -> dict: """ - Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length - in the batch. - - Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, - `self.pad_token_id` and `self.pad_token_type_id`). - - Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the - text followed by a call to the `pad` method to get a padded encoding. - - - - If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the - result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of - PyTorch tensors, you will lose the specific device of your tensors however. - - + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Args: - encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]` or `list[dict[str, list[int]]]`): - Tokenized inputs. Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of - tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str, - list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader - collate function. - - Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see - the note above for the return type. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - pad_to_multiple_of (`int`, *optional*): - If set will pad the sequence to a multiple of the provided value. - - This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + encoded_inputs: + Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in `padding_side` argument: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). 
- padding_side (`str`, *optional*): + padding_side: The side on which the model should have padding applied. Should be selected between ['right', 'left']. Default value is picked from the class attribute of the same name. - return_attention_mask (`bool`, *optional*): - Whether to return the attention mask. If left to the default, will return the attention mask according - to the specific tokenizer's default, defined by the `return_outputs` attribute. - - [What are attention masks?](../glossary#attention-mask) - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors instead of list of python integers. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return Numpy `np.ndarray` objects. - verbose (`bool`, *optional*, defaults to `True`): - Whether or not to print more information and warnings. + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ # Load from model defaults if return_attention_mask is None: From 4d59bc5b036a930e8b728af0c69648ce0a047e79 Mon Sep 17 00:00:00 2001 From: Jeff Zhang <56655972+pco111@users.noreply.github.com> Date: Tue, 29 Jul 2025 13:33:25 -0400 Subject: [PATCH 08/10] Update src/transformers/tokenization_utils_base.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index f717a0cf753e..881f7770f7a2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1694,7 +1694,7 @@ def apply_chat_template( else: return rendered_chat - def encode_message( + def encode_message_with_chat_template( self, message: dict[str, str], conversation_history: Optional[list[dict[str, str]]] = None, From d70fbec9a3e28737f165112648e8ee69f44c0460 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Tue, 29 Jul 2025 14:45:50 -0400 Subject: [PATCH 09/10] Repair the call of the `encode_message` method, update it to `encode_message_with_chat_template` to support the chat template, and adjust the relevant test cases to reflect this change. --- src/transformers/tokenization_utils_base.py | 6 ++++-- tests/tokenization/test_tokenization_utils.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 881f7770f7a2..25b965242919 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1722,7 +1722,7 @@ def encode_message_with_chat_template( """ if "add_generation_prompt" in kwargs: raise ValueError( - "`encode_message` does not support `add_generation_prompt`. Please add the generation prompt " + "`encode_message_with_chat_template` does not support `add_generation_prompt`. Please add the generation prompt " "separately." 
) @@ -1730,7 +1730,9 @@ def encode_message_with_chat_template( return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs) conversation = conversation_history + [message] - tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs) + tokens = self.apply_chat_template( + conversation, add_generation_prompt=False, tokenize=True, **kwargs + ) prefix_tokens = self.apply_chat_template( conversation_history, add_generation_prompt=False, tokenize=True, **kwargs diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index 8ba755fa4e91..fc74223110f8 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -393,7 +393,7 @@ def test_encode_message(self): # Now, test the message-by-message encoding tokens = [] for i, message in enumerate(conversation): - tokens += tokenizer.encode_message(message, conversation_history=conversation[:i]) + tokens += tokenizer.encode_message_with_chat_template(message, conversation_history=conversation[:i]) self.assertEqual(whole_conversation_tokens, tokens) @@ -404,4 +404,4 @@ def test_encode_message_raises_on_add_generation_prompt(self): {"role": "user", "content": "Hey there, how are you?"}, ] with self.assertRaises(ValueError): - tokenizer.encode_message(conversation[0], add_generation_prompt=True) + tokenizer.encode_message_with_chat_template(conversation[0], add_generation_prompt=True) From e859b5298cf67eebde58f89df40b693be3b62a47 Mon Sep 17 00:00:00 2001 From: pco111 <15262555+pco111@user.noreply.gitee.com> Date: Tue, 29 Jul 2025 14:52:58 -0400 Subject: [PATCH 10/10] Optimize the call format of the `apply_chat_template` method, and merge multi-line calls into a single line to improve code readability. --- src/transformers/tokenization_utils_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 25b965242919..1170217742c3 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1730,9 +1730,7 @@ def encode_message_with_chat_template( return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs) conversation = conversation_history + [message] - tokens = self.apply_chat_template( - conversation, add_generation_prompt=False, tokenize=True, **kwargs - ) + tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs) prefix_tokens = self.apply_chat_template( conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
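
For reference, the per-message splitting logic that the final implementation relies on can be restated as a standalone helper. This is a minimal sketch, not library API: the function name is illustrative, and it assumes any tokenizer whose chat template is supported by `apply_chat_template`.

    # Minimal sketch of the prefix-diff splitting used by
    # encode_message_with_chat_template (helper name is illustrative).
    def encode_single_message(tokenizer, message, conversation_history=None, **kwargs):
        if not conversation_history:
            return tokenizer.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs)
        conversation = conversation_history + [message]
        tokens = tokenizer.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs)
        prefix_tokens = tokenizer.apply_chat_template(
            conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
        )
        # Return everything after the first index where the two token streams
        # diverge; if the prefix matches exactly, return the remaining suffix.
        min_len = min(len(prefix_tokens), len(tokens))
        for i in range(min_len):
            if prefix_tokens[i] != tokens[i]:
                return tokens[i:]
        return tokens[min_len:]

    # Usage mirrors the test in this series:
    #   tokens = []
    #   for i, m in enumerate(conversation):
    #       tokens += encode_single_message(tokenizer, m, conversation[:i])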