add warning to let the user know that the __call__ method is faster than encode + pad for a fast tokenizer (#18693)
* add warning to let the user know that encode + pad is slower than the __call__ method for a fast tokenizer * user warnings * fix layoutlmv2 * fix layout* * change warnings into logger.warning
This commit is contained in:
@@ -48,6 +48,7 @@ from transformers import (
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
logging,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
TOKEN,
|
||||
@@ -81,6 +82,8 @@ if is_tokenizers_available():
|
||||
from test_module.custom_tokenization_fast import CustomTokenizerFast
|
||||
|
||||
|
||||
# Module-level logger for this test file; the padding-warning test uses it to emit a sentinel "Dummy warning" record.
logger = logging.get_logger(__name__)
|
||||
|
||||
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
|
||||
|
||||
SMALL_TRAINING_CORPUS = [
|
||||
@@ -1834,6 +1837,47 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
|
||||
self.assertEqual([0] * padding_size + attention_mask, left_padded_attention_mask)
|
||||
|
||||
def test_padding_warning_message_fast_tokenizer(self):
    """Check which tokenizers warn when `pad` is called on an already-encoded input.

    Fast (Rust) tokenizer: calling `pad` on an encoding must produce exactly one
    WARNING-level record on the "transformers" logger, containing the advisory that
    `__call__` is faster than encoding followed by `pad`.

    Slow tokenizer: calling `pad` must produce no WARNING-level record on the root
    logger other than the sentinel record this test emits itself.

    Each half is skipped (plain early return) when the corresponding
    `test_rust_tokenizer` / `test_slow_tokenizer` flag is falsy.
    """
    if not self.test_rust_tokenizer:
        return

    text = "This is a text"

    # ---- Fast tokenizer: exactly one warning, carrying the advisory message ----
    rust_tokenizer = self.get_rust_tokenizer()
    # Helper defined elsewhere; presumably verifies behaviour when no pad_token_id exists
    # and adds a pad token if needed, so that the `pad` call below can succeed.
    self._check_no_pad_token_padding(rust_tokenizer, text)
    rust_encoding = rust_tokenizer(text)

    expected_warning = (
        "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
        " encode the text followed by a call to the `pad` method to get a padded encoding."
    )
    with self.assertLogs("transformers", level="WARNING") as captured:
        rust_tokenizer.pad(rust_encoding)
    fast_records = captured.records
    self.assertEqual(len(fast_records), 1)
    self.assertIn(expected_warning, fast_records[0].message)

    if not self.test_slow_tokenizer:
        return

    # ---- Slow tokenizer: no warning expected ----
    python_tokenizer = self.get_tokenizer()
    # Same pad-token preparation as for the fast tokenizer above.
    self._check_no_pad_token_padding(python_tokenizer, text)
    python_encoding = python_tokenizer(text)

    with self.assertLogs(level="WARNING") as captured:
        # `assertLogs` fails when nothing is logged, so it cannot express "no warnings" directly.
        # Emit one sentinel warning ourselves, then verify it is the only record captured.
        logger.warning("Dummy warning")
        python_tokenizer.pad(python_encoding)
    slow_records = captured.records
    self.assertEqual(len(slow_records), 1)
    self.assertIn("Dummy warning", slow_records[0].message)
|
||||
|
||||
def test_separate_tokenizers(self):
|
||||
# This tests that tokenizers don't impact others. Unfortunately the case where it fails is when
|
||||
# we're loading an S3 configuration from a pre-trained identifier, and we have no way of testing those today.
|
||||
|
||||
Reference in New Issue
Block a user