add warning to let the user know that the __call__ method is faster than encode + pad for a fast tokenizer (#18693)

* add warning to let the user know that the  method is slower that  for a fast tokenizer

* user warnings

* fix layoutlmv2

* fix layout*

* change warnings into logger.warning
This commit is contained in:
SaulLu
2022-08-24 12:27:56 +02:00
committed by GitHub
parent dcff504e18
commit 6667b0d7bf
5 changed files with 217 additions and 4 deletions

View File

@@ -2821,7 +2821,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
in the batch.
Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
`self.pad_token_id` and `self.pad_token_type_id`)
`self.pad_token_id` and `self.pad_token_type_id`).
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
text followed by a call to the `pad` method to get a padded encoding.
<Tip>
@@ -2871,6 +2874,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
verbose (`bool`, *optional*, defaults to `True`):
Whether or not to print more information and warnings.
"""
if self.__class__.__name__.endswith("Fast"):
if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
logger.warning_advice(
f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
" using the `__call__` method is faster than using a method to encode the text followed by a call"
" to the `pad` method to get a padded encoding."
)
self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):