add warning to let the user know that the __call__ method is faster than encode + pad for a fast tokenizer (#18693)

* add warning to let the user know that the method is slower that for a fast tokenizer * user warnings * fix layoutlmv2 * fix layout* * change warnings into logger.warning
2022-08-24 12:27:56 +02:00
parent dcff504e18
commit 6667b0d7bf
5 changed files with 217 additions and 4 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2821,7 +2821,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        in the batch.

        Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`,
-        `self.pad_token_id` and `self.pad_token_type_id`)
+        `self.pad_token_id` and `self.pad_token_type_id`).
+
+        Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the
+        text followed by a call to the `pad` method to get a padded encoding.

        <Tip>

@@ -2871,6 +2874,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            verbose (`bool`, *optional*, defaults to `True`):
                Whether or not to print more information and warnings.
        """
+        if self.__class__.__name__.endswith("Fast"):
+            if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
+                logger.warning_advice(
+                    f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
+                    " using the `__call__` method is faster than using a method to encode the text followed by a call"
+                    " to the `pad` method to get a padded encoding."
+                )
+                self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):