🚨 rm already deprecated pad_to_max_length arg (#37617)

* rm already deprecated padding max length

* truncate_strategy AS AN ARG is already deprecated for a few years

* fix

* rm test_padding_to_max_length

* rm pad_to_max_length=True in other tests

* rm from common

* missed fnet
This commit is contained in:
Ita Zaporozhets
2025-05-01 15:21:55 +02:00
committed by GitHub
parent 7a3e208892
commit c80f65265b
11 changed files with 5 additions and 546 deletions

View File

@@ -1074,10 +1074,10 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
def _convert_to_features(example_batch):
input_encodings = tokenizer.batch_encode_plus(
example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True
example_batch["input_text"], padding="max_length", max_length=512, truncation=True
)
target_encodings = tokenizer.batch_encode_plus(
example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True
example_batch["target_text"], padding="max_length", max_length=16, truncation=True
)
encodings = {

View File

@@ -829,7 +829,6 @@ class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
input_dict = tokenizer(
[ARTICLE_SIGMA, ARTICLE_AMERICA],
padding="max_length",
pad_to_max_length=True,
max_length=512,
return_tensors="pt",
)

View File

@@ -205,9 +205,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
pad_token_id = tokenizer_p.pad_token_id
# Encode - Simple input
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -217,13 +214,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode - Pair input
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
@@ -236,14 +226,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode_plus - Simple input
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
@@ -259,14 +241,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
# Encode_plus - Pair input
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
@@ -282,18 +256,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
)
# Batch_encode_plus - Simple input
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,

View File

@@ -566,41 +566,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, words)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -612,9 +577,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -625,13 +587,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -641,10 +596,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -660,14 +611,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -686,20 +629,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Batch_encode_plus - Simple input
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,

View File

@@ -460,41 +460,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, words)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -506,9 +471,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -519,13 +481,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -535,10 +490,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -554,14 +505,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -580,20 +523,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Batch_encode_plus - Simple input
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,

View File

@@ -497,41 +497,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, words)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -543,9 +508,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -556,13 +518,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -572,10 +527,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -591,14 +542,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -617,20 +560,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Batch_encode_plus - Simple input
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
words,
boxes=boxes,

View File

@@ -382,41 +382,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
nodes, xpaths = self.get_nodes_and_xpaths()
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, nodes)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
nodes, xpaths=xpaths, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(nodes, xpaths=xpaths, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -428,9 +393,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Simple input
nodes, xpaths = self.get_nodes_and_xpaths()
input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -441,13 +403,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Pair input
question, nodes, xpaths = self.get_question_nodes_and_xpaths()
input_r = tokenizer_r.encode(
question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(
question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length"
)
@@ -461,10 +416,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Simple input
nodes, xpaths = self.get_nodes_and_xpaths()
input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -480,14 +431,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Pair input
question, nodes, xpaths = self.get_question_nodes_and_xpaths()
input_r = tokenizer_r.encode_plus(
question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length"
)
@@ -506,20 +449,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Batch_encode_plus - Simple input
nodes, xpaths = self.get_nodes_and_xpaths_batch()
input_r = tokenizer_r.batch_encode_plus(
nodes,
xpaths=xpaths,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
nodes,
xpaths=xpaths,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
nodes,
xpaths=xpaths,

View File

@@ -657,42 +657,6 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
table = self.get_table(tokenizer)
sequence = "Sequence"
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequence)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
table, sequence, max_length=sequence_length + padding_size, padding=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(table, sequence)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus
tokenizers = self.get_tokenizers(do_lower_case=False)

View File

@@ -417,41 +417,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
words, boxes = self.get_words_and_boxes()
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, words)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode_boxes(
words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
assert sequence_length == padded_sequence_right_length
assert encoded_sequence == padded_sequence_right
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -463,9 +428,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -476,13 +438,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_boxes(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_boxes(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode_boxes(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -496,14 +451,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Simple input
words, boxes = self.get_words_and_boxes()
input_r = tokenizer_r.encode_plus_boxes(
words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus_boxes(
words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus_boxes(
words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -523,14 +470,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Encode_plus - Pair input
question, words, boxes = self.get_question_words_and_boxes()
input_r = tokenizer_r.encode_plus_boxes(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus_boxes(
question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus_boxes(
question, words, boxes=boxes, max_length=max_length, padding="max_length"
)
@@ -549,20 +488,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Batch_encode_plus - Simple input
words, boxes = self.get_words_and_boxes_batch()
input_r = tokenizer_r.batch_encode_plus_boxes(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus_boxes(
words,
boxes=boxes,
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus_boxes(
words,
boxes=boxes,

View File

@@ -2475,41 +2475,6 @@ class TokenizerTesterMixin:
self.assertEqual(sequence_length, truncated_sequence_left_length)
self.assertEqual(encoded_sequence, truncated_sequence_left)
def test_padding_to_max_length(self):
"""We keep this test for backward compatibility but it should be remove when `pad_to_max_length` is deprecated."""
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
sequence = "Sequence"
padding_size = 10
# check correct behaviour if no pad_token_id exists and add it eventually
self._check_no_pad_token_padding(tokenizer, sequence)
padding_idx = tokenizer.pad_token_id
# Check that it correctly pads when a maximum length is specified along with the padding flag set to True
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
# FIXME: the next line should be padding(max_length) to avoid warning
padded_sequence = tokenizer.encode(
sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
self.assertEqual(sequence_length + padding_size, padded_sequence_length)
self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
# Check that nothing is done when a maximum length is not specified
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
tokenizer.padding_side = "right"
padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
padded_sequence_right_length = len(padded_sequence_right)
self.assertEqual(sequence_length, padded_sequence_right_length)
self.assertEqual(encoded_sequence, padded_sequence_right)
def test_padding_to_multiple_of(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
@@ -3900,9 +3865,6 @@ class TokenizerTesterMixin:
pad_token_id = tokenizer_p.pad_token_id
# Encode - Simple input
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -3912,13 +3874,6 @@ class TokenizerTesterMixin:
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode - Pair input
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.encode(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
@@ -3931,14 +3886,6 @@ class TokenizerTesterMixin:
self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
# Encode_plus - Simple input
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", max_length=max_length, padding="max_length"
)
@@ -3957,14 +3904,6 @@ class TokenizerTesterMixin:
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Encode_plus - Pair input
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
input_p = tokenizer_p.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
)
self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
input_r = tokenizer_r.encode_plus(
"This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
)
@@ -3981,18 +3920,6 @@ class TokenizerTesterMixin:
self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
# Batch_encode_plus - Simple input
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
input_p = tokenizer_p.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,
pad_to_max_length=True,
)
self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
input_r = tokenizer_r.batch_encode_plus(
["This is a simple input 1", "This is a simple input 2"],
max_length=max_length,