add warning to let the user know that the __call__ method is faster than encode + pad for a fast tokenizer (#18693)
* add warning to let the user know that the method is slower than `__call__` for a fast tokenizer * user warnings * fix layoutlmv2 * fix layout* * change warnings into logger.warning
This commit is contained in:
@@ -2821,7 +2821,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
in the batch.
|
||||
|
||||
Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
|
||||
`self.pad_token_id` and `self.pad_token_type_id`)
|
||||
`self.pad_token_id` and `self.pad_token_type_id`).
|
||||
|
||||
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
|
||||
text followed by a call to the `pad` method to get a padded encoding.
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -2871,6 +2874,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
verbose (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to print more information and warnings.
|
||||
"""
|
||||
if self.__class__.__name__.endswith("Fast"):
|
||||
if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
|
||||
logger.warning_advice(
|
||||
f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
|
||||
" using the `__call__` method is faster than using a method to encode the text followed by a call"
|
||||
" to the `pad` method to get a padded encoding."
|
||||
)
|
||||
self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
|
||||
|
||||
# If we have a list of dicts, let's convert it in a dict of lists
|
||||
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
|
||||
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
|
||||
|
||||
Reference in New Issue
Block a user