Skipping outputs (#3116)

* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
This commit is contained in:
Lysandre Debut
2020-03-09 13:48:58 -04:00
committed by GitHub
parent 49debe62fd
commit 5164ea91a7
4 changed files with 251 additions and 163 deletions

View File

@@ -48,7 +48,7 @@ class TokenizerTesterMixin:
# to the concatenated encode_plus format: [{'input_ids': [...], ...}, {'input_ids': [...], ...}]
return [
{value: batch_encode_plus_sequences[value][i] for value in batch_encode_plus_sequences.keys()}
for i in range(len(batch_encode_plus_sequences))
for i in range(len(batch_encode_plus_sequences["input_ids"]))
]
def test_tokenizers_common_properties(self):
@@ -261,7 +261,10 @@ class TokenizerTesterMixin:
def test_mask_output(self):
tokenizer = self.get_tokenizer()
if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
if (
tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
and "token_type_ids" in tokenizer.model_input_names
):
seq_0 = "Test this method."
seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
@@ -504,51 +507,58 @@ class TokenizerTesterMixin:
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence["input_ids"]
token_type_ids = encoded_sequence["token_type_ids"]
attention_mask = encoded_sequence["attention_mask"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test right padding
tokenizer.padding_side = "right"
padded_sequence = tokenizer.encode_plus(
right_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
right_padded_input_ids = right_padded_sequence["input_ids"]
assert sequence_length + padding_size == padded_sequence_length
assert input_ids + [padding_idx] * padding_size == padded_input_ids
assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
assert attention_mask + [0] * padding_size == padded_attention_mask
assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
right_padded_sequence_length = len(right_padded_input_ids)
assert sequence_length + padding_size == right_padded_sequence_length
assert input_ids + [padding_idx] * padding_size == right_padded_input_ids
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
# Test left padding
tokenizer.padding_side = "left"
padded_sequence = tokenizer.encode_plus(
left_padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + input_ids == padded_input_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
assert [0] * padding_size + attention_mask == padded_attention_mask
assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
assert sequence_length + padding_size == left_padded_sequence_length
assert [padding_idx] * padding_size + input_ids == left_padded_input_ids
assert [1] * padding_size + special_tokens_mask == left_padded_special_tokens_mask
if "token_type_ids" in tokenizer.model_input_names:
token_type_ids = encoded_sequence["token_type_ids"]
left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
right_padded_token_type_ids = right_padded_sequence["token_type_ids"]
assert token_type_ids + [token_type_padding_idx] * padding_size == right_padded_token_type_ids
assert [token_type_padding_idx] * padding_size + token_type_ids == left_padded_token_type_ids
if "attention_mask" in tokenizer.model_input_names:
attention_mask = encoded_sequence["attention_mask"]
right_padded_attention_mask = right_padded_sequence["attention_mask"]
left_padded_attention_mask = left_padded_sequence["attention_mask"]
assert attention_mask + [0] * padding_size == right_padded_attention_mask
assert [0] * padding_size + attention_mask == left_padded_attention_mask
def test_separate_tokenizers(self):
# This tests that tokenizers don't impact others. Unfortunately the case where it fails is when