@@ -3268,22 +3268,41 @@ def pad(
      verbose: bool = True,
  ) -> BatchEncoding:
      """
-     Pad a single encoded input or a batch of encoded inputs up to the maximum length of the batch or up to a
-     given maximum length. Padding side can be specified on the left or on the right.
+     Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
+     in the batch.
+
+     Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
+     `self.pad_token_id` and `self.pad_token_type_id`).
+
+     Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
+     text followed by a call to the `pad` method to get a padded encoding.
+
+     <Tip>
+
+     If the `encoded_inputs` passed are dictionaries of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+     result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
+     PyTorch tensors, however, you will lose the specific device of your tensors.
+
+     </Tip>

      Args:
-         encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`):
-             Tokenized inputs. Can be a single batch encoding, a list of batch encodings, a dictionary of entries
-             produced by a `tokenizer.encode_plus` or a list of dictionaries from a `tokenizer.batch_encode_plus`.
+         encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]]` or `list[dict[str, list[int]]]`):
+             Tokenized inputs. Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of
+             tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str,
+             list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
+             collate function.
+
+             Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
+             the note above for the return type.
          padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
              Select a strategy to pad the returned sequences (according to the model's padding side and padding
              index) among:

-             - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+             - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
                sequence is provided).
              - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                acceptable input length for the model if that argument is not provided.
-             - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+             - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                lengths).
          max_length (`int`, *optional*):
              Maximum length of the returned list and optionally padding length (see above).
@@ -3293,18 +3312,19 @@ def pad(
              This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
              `>= 7.5` (Volta).
          padding_side (`str`, *optional*):
-             'right' or 'left'. If not provided, the `tokenizer.padding_side` will be used.
+             The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+             Default value is picked from the class attribute of the same name.
          return_attention_mask (`bool`, *optional*):
              Whether to return the attention mask. If left to the default, will return the attention mask according
-             to the specific tokenizer's default.
+             to the specific tokenizer's default, defined by the `return_outputs` attribute.

              [What are attention masks?](../glossary#attention-mask)
          return_tensors (`str` or [`~utils.TensorType`], *optional*):
              If set, will return tensors instead of list of python integers. Acceptable values are:

-             - `'tf'`: Return TensorFlow `tf.Tensor` objects.
+             - `'tf'`: Return TensorFlow `tf.constant` objects.
              - `'pt'`: Return PyTorch `torch.Tensor` objects.
-             - `'np'`: Return NumPy `np.ndarray` objects.
+             - `'np'`: Return Numpy `np.ndarray` objects.
          verbose (`bool`, *optional*, defaults to `True`):
              Whether or not to print more information and warnings.
      """
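
As a quick illustration of the `pad` API documented above (this example is not part of the diff; the checkpoint name and sentences are arbitrary placeholders), a minimal sketch of dynamic padding, e.g. inside a DataLoader collate function:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # any checkpoint works

# Encode without padding first; each example keeps its own length.
features = [
    tokenizer("a short sentence"),
    tokenizer("a noticeably longer sentence that needs quite a few more tokens"),
]

# Pad the batch to its longest member, rounded up to a multiple of 8
# (handy for Tensor Cores), and get PyTorch tensors back.
batch = tokenizer.pad(
    features,
    padding="longest",
    pad_to_multiple_of=8,
    return_tensors="pt",
)

print(batch["input_ids"].shape)    # e.g. torch.Size([2, 16])
print(batch["attention_mask"][0])  # 1s for real tokens, 0s for the padding
```

Because `return_tensors="pt"` is passed, the result is a dict of `torch.Tensor` objects; if the inputs had already been tensors, the same type would be kept, as the Tip in the new docstring notes.
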
@@ -3725,65 +3745,30 @@ def _pad(
      return_attention_mask: Optional[bool] = None,
  ) -> dict:
      """
-     Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
-     in the batch.
-
-     Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
-     `self.pad_token_id` and `self.pad_token_type_id`).
-
-     Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
-     text followed by a call to the `pad` method to get a padded encoding.
-
-     <Tip>
-
-     If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
-     result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
-     PyTorch tensors, you will lose the specific device of your tensors however.
-
-     </Tip>
+     Pad encoded inputs (on left/right and up to predefined length or max length in the batch).

      Args:
-         encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `dict[str, list[int]]`, `dict[str, list[list[int]]` or `list[dict[str, list[int]]]`):
-             Tokenized inputs. Can represent one input ([`BatchEncoding`] or `dict[str, list[int]]`) or a batch of
-             tokenized inputs (list of [`BatchEncoding`], *dict[str, list[list[int]]]* or *list[dict[str,
-             list[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader
-             collate function.
-
-             Instead of `list[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see
-             the note above for the return type.
-         padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-             Select a strategy to pad the returned sequences (according to the model's padding side and padding
-             index) among:
-
-             - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
-               sequence if provided).
-             - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-               acceptable input length for the model if that argument is not provided.
-             - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
-               lengths).
-         max_length (`int`, *optional*):
-             Maximum length of the returned list and optionally padding length (see above).
-         pad_to_multiple_of (`int`, *optional*):
-             If set will pad the sequence to a multiple of the provided value.
-
-             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+         encoded_inputs:
+             Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
+         max_length: maximum length of the returned list and optionally padding length (see below).
+             Will truncate by taking into account the special tokens.
+         padding_strategy: PaddingStrategy to use for padding.
+
+             - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
+             - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+             - PaddingStrategy.DO_NOT_PAD: Do not pad
+             The tokenizer padding sides are defined in the `padding_side` argument:
+
+             - 'left': pads on the left of the sequences
+             - 'right': pads on the right of the sequences
+         pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
              `>= 7.5` (Volta).
-         padding_side (`str`, *optional*):
+         padding_side:
              The side on which the model should have padding applied. Should be selected between ['right', 'left'].
              Default value is picked from the class attribute of the same name.
-         return_attention_mask (`bool`, *optional*):
-             Whether to return the attention mask. If left to the default, will return the attention mask according
-             to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-             [What are attention masks?](../glossary#attention-mask)
-         return_tensors (`str` or [`~utils.TensorType`], *optional*):
-             If set, will return tensors instead of list of python integers. Acceptable values are:
-
-             - `'tf'`: Return TensorFlow `tf.constant` objects.
-             - `'pt'`: Return PyTorch `torch.Tensor` objects.
-             - `'np'`: Return Numpy `np.ndarray` objects.
-         verbose (`bool`, *optional*, defaults to `True`):
-             Whether or not to print more information and warnings.
+         return_attention_mask:
+             (optional) Set to False to avoid returning attention mask (default: set to model specifics)
      """
      # Load from model defaults
      if return_attention_mask is None:
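
To make the `padding_strategy` and `padding_side` options above concrete: `_pad` is the per-example helper that `pad` dispatches to, so the same behavior can be sketched through the public entry point. This is illustrative only; the checkpoint and `max_length` are arbitrary, and passing `padding_side` at call time is the addition this diff documents.

```python
from transformers import AutoTokenizer
from transformers.utils import PaddingStrategy

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded = tokenizer("pad me")  # a single un-padded example (plain python lists)

# PaddingStrategy.MAX_LENGTH pads every sequence to `max_length`;
# padding_side="left" puts the pad tokens before the real ones.
padded = tokenizer.pad(
    encoded,
    padding=PaddingStrategy.MAX_LENGTH,  # equivalent to padding="max_length"
    max_length=8,
    padding_side="left",
)

print(padded["input_ids"])       # pad_token_id repeated on the left, then the original ids
print(padded["attention_mask"])  # e.g. [0, 0, 0, 0, 1, 1, 1, 1]
```
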