🚨 rm already deprecated pad_to_max_length arg (#37617)
* rm already deprecated padding max length
* truncate_strategy AS AN ARG has already been deprecated for a few years
* fix
* rm test_padding_to_max_length
* rm pad_to_max_length=True in other tests
* rm from common
* missed fnet
This commit is contained in:
@@ -566,41 +566,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
|
||||
)
|
||||
|
||||
def test_padding_to_max_length(self):
|
||||
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
words, boxes = self.get_words_and_boxes()
|
||||
padding_size = 10
|
||||
|
||||
# check correct behaviour if no pad_token_id exists and add it eventually
|
||||
self._check_no_pad_token_padding(tokenizer, words)
|
||||
|
||||
padding_idx = tokenizer.pad_token_id
|
||||
|
||||
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
|
||||
tokenizer.padding_side = "right"
|
||||
encoded_sequence = tokenizer.encode(words, boxes=boxes)
|
||||
sequence_length = len(encoded_sequence)
|
||||
# FIXME: the next line should be padding(max_length) to avoid warning
|
||||
padded_sequence = tokenizer.encode(
|
||||
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
|
||||
)
|
||||
padded_sequence_length = len(padded_sequence)
|
||||
assert sequence_length + padding_size == padded_sequence_length
|
||||
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
|
||||
|
||||
# Check that nothing is done when a maximum length is not specified
|
||||
encoded_sequence = tokenizer.encode(words, boxes=boxes)
|
||||
sequence_length = len(encoded_sequence)
|
||||
|
||||
tokenizer.padding_side = "right"
|
||||
padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
|
||||
padded_sequence_right_length = len(padded_sequence_right)
|
||||
assert sequence_length == padded_sequence_right_length
|
||||
assert encoded_sequence == padded_sequence_right
|
||||
|
||||
def test_padding(self, max_length=50):
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
@@ -612,9 +577,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
# Encode - Simple input
|
||||
words, boxes = self.get_words_and_boxes()
|
||||
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
|
||||
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
|
||||
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
@@ -625,13 +587,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
# Encode - Pair input
|
||||
question, words, boxes = self.get_question_words_and_boxes()
|
||||
input_r = tokenizer_r.encode(
|
||||
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
|
||||
)
|
||||
input_p = tokenizer_p.encode(
|
||||
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
|
||||
)
|
||||
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
@@ -641,10 +596,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
# Encode_plus - Simple input
|
||||
words, boxes = self.get_words_and_boxes()
|
||||
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
|
||||
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
|
||||
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
|
||||
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
|
||||
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
|
||||
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
|
||||
@@ -660,14 +611,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
|
||||
# Encode_plus - Pair input
|
||||
question, words, boxes = self.get_question_words_and_boxes()
|
||||
input_r = tokenizer_r.encode_plus(
|
||||
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
|
||||
)
|
||||
input_p = tokenizer_p.encode_plus(
|
||||
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
|
||||
)
|
||||
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
|
||||
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
|
||||
input_r = tokenizer_r.encode_plus(
|
||||
question, words, boxes=boxes, max_length=max_length, padding="max_length"
|
||||
)
|
||||
@@ -686,20 +629,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# Batch_encode_plus - Simple input
|
||||
words, boxes = self.get_words_and_boxes_batch()
|
||||
|
||||
input_r = tokenizer_r.batch_encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=max_length,
|
||||
pad_to_max_length=True,
|
||||
)
|
||||
input_p = tokenizer_p.batch_encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
max_length=max_length,
|
||||
pad_to_max_length=True,
|
||||
)
|
||||
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
|
||||
|
||||
input_r = tokenizer_r.batch_encode_plus(
|
||||
words,
|
||||
boxes=boxes,
|
||||
|
||||
Reference in New Issue
Block a user