add warning to let the user know that the __call__ method is faster than encode + pad for a fast tokenizer (#18693)
* add warning to let the user know that the method is slower than for a fast tokenizer * user warnings * fix layoutlmv2 * fix layout* * change warnings into logger.warning
This commit is contained in:
@@ -21,7 +21,14 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from transformers import AddedToken, LayoutLMv2TokenizerFast, SpecialTokensMixin, is_tf_available, is_torch_available
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutLMv2TokenizerFast,
|
||||
SpecialTokensMixin,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
logging,
|
||||
)
|
||||
from transformers.models.layoutlmv2.tokenization_layoutlmv2 import (
|
||||
VOCAB_FILES_NAMES,
|
||||
BasicTokenizer,
|
||||
@@ -41,6 +48,9 @@ from ...test_tokenization_common import (
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@require_tokenizers
|
||||
@require_pandas
|
||||
class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
@@ -788,6 +798,49 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
|
||||
def test_padding_warning_message_fast_tokenizer(self):
    """Check the warnings recorded when `pad` is called on an existing encoding.

    For the fast tokenizer, exactly one WARNING record must be captured on the
    "transformers" logger and it must contain the `__call__`-is-faster notice.
    For the slow tokenizer, the only WARNING record captured on the root
    logger must be the dummy one emitted by this test itself.
    """
    if not self.test_rust_tokenizer:
        return

    batch_words, batch_boxes = self.get_words_and_boxes_batch()
    expected_notice = (
        "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
        " encode the text followed by a call to the `pad` method to get a padded encoding."
    )

    rust_tokenizer = self.get_rust_tokenizer()
    rust_encoding = rust_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs("transformers", level="WARNING") as fast_logs:
        rust_tokenizer.pad(rust_encoding)
    self.assertEqual(len(fast_logs.records), 1)
    self.assertIn(expected_notice, fast_logs.records[0].message)

    if not self.test_slow_tokenizer:
        return

    python_tokenizer = self.get_tokenizer()
    python_encoding = python_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs(level="WARNING") as slow_logs:
        # `assertLogs` fails when nothing is logged, so it cannot directly express
        # "no warnings". Emit a dummy warning first and then verify that it is
        # the only record that was captured.
        logger.warning("Dummy warning")
        python_tokenizer.pad(python_encoding)
    self.assertEqual(len(slow_logs.records), 1)
    self.assertIn("Dummy warning", slow_logs.records[0].message)
|
||||
|
||||
def test_call(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
@@ -22,13 +22,23 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from transformers import AddedToken, LayoutLMv3TokenizerFast, SpecialTokensMixin, is_tf_available, is_torch_available
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutLMv3TokenizerFast,
|
||||
SpecialTokensMixin,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
logging,
|
||||
)
|
||||
from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer
|
||||
from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, require_tokenizers, require_torch, slow
|
||||
|
||||
from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@require_tokenizers
|
||||
@require_pandas
|
||||
class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
@@ -668,6 +678,49 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
|
||||
def test_padding_warning_message_fast_tokenizer(self):
    """Check the warnings recorded when `pad` is called on an existing encoding.

    For the fast tokenizer, exactly one WARNING record must be captured on the
    "transformers" logger and it must contain the `__call__`-is-faster notice.
    For the slow tokenizer, the only WARNING record captured on the root
    logger must be the dummy one emitted by this test itself.
    """
    if not self.test_rust_tokenizer:
        return

    batch_words, batch_boxes = self.get_words_and_boxes_batch()
    expected_notice = (
        "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
        " encode the text followed by a call to the `pad` method to get a padded encoding."
    )

    rust_tokenizer = self.get_rust_tokenizer()
    rust_encoding = rust_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs("transformers", level="WARNING") as fast_logs:
        rust_tokenizer.pad(rust_encoding)
    self.assertEqual(len(fast_logs.records), 1)
    self.assertIn(expected_notice, fast_logs.records[0].message)

    if not self.test_slow_tokenizer:
        return

    python_tokenizer = self.get_tokenizer()
    python_encoding = python_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs(level="WARNING") as slow_logs:
        # `assertLogs` fails when nothing is logged, so it cannot directly express
        # "no warnings". Emit a dummy warning first and then verify that it is
        # the only record that was captured.
        logger.warning("Dummy warning")
        python_tokenizer.pad(python_encoding)
    self.assertEqual(len(slow_logs.records), 1)
    self.assertIn("Dummy warning", slow_logs.records[0].message)
|
||||
|
||||
def test_call(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
@@ -19,7 +19,14 @@ import tempfile
|
||||
import unittest
|
||||
from typing import List
|
||||
|
||||
from transformers import AddedToken, LayoutXLMTokenizerFast, SpecialTokensMixin, is_tf_available, is_torch_available
|
||||
from transformers import (
|
||||
AddedToken,
|
||||
LayoutXLMTokenizerFast,
|
||||
SpecialTokensMixin,
|
||||
is_tf_available,
|
||||
is_torch_available,
|
||||
logging,
|
||||
)
|
||||
from transformers.models.layoutxlm.tokenization_layoutxlm import LayoutXLMTokenizer
|
||||
from transformers.testing_utils import (
|
||||
get_tests_dir,
|
||||
@@ -40,6 +47,7 @@ from ...test_tokenization_common import (
|
||||
)
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
|
||||
|
||||
|
||||
@@ -697,6 +705,49 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
|
||||
def test_padding_warning_message_fast_tokenizer(self):
    """Check the warnings recorded when `pad` is called on an existing encoding.

    For the fast tokenizer, exactly one WARNING record must be captured on the
    "transformers" logger and it must contain the `__call__`-is-faster notice.
    For the slow tokenizer, the only WARNING record captured on the root
    logger must be the dummy one emitted by this test itself.
    """
    if not self.test_rust_tokenizer:
        return

    batch_words, batch_boxes = self.get_words_and_boxes_batch()
    expected_notice = (
        "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
        " encode the text followed by a call to the `pad` method to get a padded encoding."
    )

    rust_tokenizer = self.get_rust_tokenizer()
    rust_encoding = rust_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs("transformers", level="WARNING") as fast_logs:
        rust_tokenizer.pad(rust_encoding)
    self.assertEqual(len(fast_logs.records), 1)
    self.assertIn(expected_notice, fast_logs.records[0].message)

    if not self.test_slow_tokenizer:
        return

    python_tokenizer = self.get_tokenizer()
    python_encoding = python_tokenizer(batch_words, boxes=batch_boxes)

    with self.assertLogs(level="WARNING") as slow_logs:
        # `assertLogs` fails when nothing is logged, so it cannot directly express
        # "no warnings". Emit a dummy warning first and then verify that it is
        # the only record that was captured.
        logger.warning("Dummy warning")
        python_tokenizer.pad(python_encoding)
    self.assertEqual(len(slow_logs.records), 1)
    self.assertIn("Dummy warning", slow_logs.records[0].message)
|
||||
|
||||
def test_call(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
|
||||
Reference in New Issue
Block a user