From 998a90bc7d8b3ecda4b1d0ef17e71afeb6729e2e Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 6 Sep 2022 14:51:03 +0200 Subject: [PATCH] Fix `test_tf_encode_plus_sent_to_model` for `LayoutLMv3` (#18898) Co-authored-by: ydshieh --- .../test_tokenization_layoutlmv3.py | 45 ++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 419b95feee..322ed6861f 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -31,7 +31,14 @@ from transformers import ( logging, ) from transformers.models.layoutlmv3.tokenization_layoutlmv3 import VOCAB_FILES_NAMES, LayoutLMv3Tokenizer -from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, require_tokenizers, require_torch, slow +from transformers.testing_utils import ( + is_pt_tf_cross_test, + require_pandas, + require_tf, + require_tokenizers, + require_torch, + slow, +) from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings @@ -2400,3 +2407,39 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): @unittest.skip("Doesn't support another framework than PyTorch") def test_np_encode_plus_sent_to_model(self): pass + + @require_tf + @slow + def test_tf_encode_plus_sent_to_model(self): + from transformers import TF_MODEL_MAPPING, TOKENIZER_MAPPING + + MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(TF_MODEL_MAPPING, TOKENIZER_MAPPING) + + tokenizers = self.get_tokenizers(do_lower_case=False) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: + return + + config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__] + config = config_class() + + if config.is_encoder_decoder or config.pad_token_id is None: + return + + model = model_class(config) + + # Make sure the model contains at least the full vocabulary size in its embedding matrix + self.assertGreaterEqual(model.config.vocab_size, len(tokenizer)) + + # Build sequence + first_ten_tokens = list(tokenizer.get_vocab().keys())[:10] + boxes = [[1000, 1000, 1000, 1000] for _ in range(len(first_ten_tokens))] + encoded_sequence = tokenizer.encode_plus(first_ten_tokens, boxes=boxes, return_tensors="tf") + batch_encoded_sequence = tokenizer.batch_encode_plus( + [first_ten_tokens, first_ten_tokens], boxes=[boxes, boxes], return_tensors="tf" + ) + + # This should not fail + model(encoded_sequence) + model(batch_encoded_sequence)