* fix #5081 and improve backward compatibility (slightly)
* add nlp to setup.cfg - style and quality
* align default to previous default
* remove test that doesn't generalize
This commit is contained in:
@@ -672,29 +672,6 @@ class TokenizerTesterMixin:
|
||||
filtered_sequence = [x for x in filtered_sequence if x is not None]
|
||||
self.assertEqual(encoded_sequence, filtered_sequence)
|
||||
|
||||
def test_special_tokens_mask_already_has_special_tokens(self):
    """Check that ``encode_plus`` returns a non-constant special-tokens mask.

    For every tokenizer returned by ``self.get_tokenizers``, a short
    sentence is encoded with ``add_special_tokens=True`` and
    ``return_special_tokens_mask=True``; the resulting
    ``special_tokens_mask`` must contain at least two distinct values.

    Tokenizers without a ``get_special_tokens_mask`` attribute, or whose
    mask for the ids ``[0, 1, 2, 3]`` is all zeros, are skipped.
    """
    for tokenizer in self.get_tokenizers(do_lower_case=False):
        # Guard clauses: skip tokenizers that cannot report special tokens.
        if not hasattr(tokenizer, "get_special_tokens_mask"):
            continue
        if tokenizer.get_special_tokens_mask([0, 1, 2, 3]) == [0, 0, 0, 0]:
            continue

        with self.subTest(tokenizer.__class__.__name__):
            sequence_0 = "Encode this."

            # The original condition repeated this exact comparison on both
            # sides of `and`; a single check gives the same result.
            # NOTE(review): the duplicated clause was presumably meant to
            # test `sep_token_id` as well -- confirm before tightening.
            if tokenizer.cls_token_id == tokenizer.unk_token_id:
                tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})

            encoded_sequence_dict = tokenizer.encode_plus(
                sequence_0, add_special_tokens=True, return_special_tokens_mask=True
            )
            special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]

            # A mask whose minimum equals its maximum is constant, i.e. it
            # does not separate special tokens from the rest.
            min_val = min(special_tokens_mask_orig)
            max_val = max(special_tokens_mask_orig)
            self.assertNotEqual(min_val, max_val)
|
||||
|
||||
def test_right_and_left_padding(self):
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False)
|
||||
for tokenizer in tokenizers:
|
||||
|
||||
Reference in New Issue
Block a user