Honor existing attention mask in tokenizer.pad (#13926)

* Honor existing attention mask in tokenizer.pad

* Fix initialization of attention mask

* Roll out the implementation to all subclasses

* Fix tests
This commit is contained in:
Sylvain Gugger
2021-10-11 09:12:09 -04:00
committed by GitHub
parent 3c0c699ffd
commit 4a18337bae
7 changed files with 68 additions and 38 deletions

View File

@@ -1460,6 +1460,25 @@ class TokenizerTesterMixin:
pad_to_multiple_of=8,
)
def test_padding_with_attention_mask(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
if "attention_mask" not in tokenizer.model_input_names:
self.skipTest("This model does not use attention mask.")
features = [
{"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]},
{"input_ids": [1, 2, 3], "attention_mask": [1, 1, 0]},
]
padded_features = tokenizer.pad(features)
if tokenizer.padding_side == "right":
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [1, 1, 0, 0, 0, 0]])
else:
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
def test_encode_plus_with_padding(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers: