Tokenizers API developments (#5103)
* Add return lengths * make pad a bit more flexible so it can be used as collate_fn * check all kwargs sent to encoding method are known * fixing kwargs in encodings * New AddedToken class in python This class let you specify specifique tokenization behaviors for some special tokens. Used in particular for GPT2 and Roberta, to control how white spaces are stripped around special tokens. * style and quality * switched to hugginface tokenizers library for AddedTokens * up to tokenizer 0.8.0-rc3 - update API to use AddedToken state * style and quality * do not raise an error on additional or unused kwargs for tokenize() but only a warning * transfo-xl pretrained model requires torch * Update src/transformers/tokenization_utils.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -390,7 +390,7 @@ class TokenizerTesterMixin:
|
||||
seq_1 = "With these inputs."
|
||||
|
||||
sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
|
||||
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True, add_prefix_space=False)
|
||||
attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
|
||||
|
||||
# Method is implemented (e.g. not GPT-2)
|
||||
if len(attached_sequences) != 2:
|
||||
@@ -416,7 +416,7 @@ class TokenizerTesterMixin:
|
||||
stride=stride,
|
||||
truncation="longest_first",
|
||||
return_overflowing_tokens=True,
|
||||
add_prefix_space=False,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
@@ -468,7 +468,7 @@ class TokenizerTesterMixin:
|
||||
|
||||
# We are not using the special tokens - a bit too hard to test all the tokenizers with this
|
||||
# TODO try this again later
|
||||
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False, add_prefix_space=False)
|
||||
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=False) # , add_prefix_space=False)
|
||||
truncated_first_sequence = tokenizer.encode(seq_0, add_special_tokens=False)[:-2] + tokenizer.encode(
|
||||
seq_1, add_special_tokens=False
|
||||
)
|
||||
@@ -499,7 +499,7 @@ class TokenizerTesterMixin:
|
||||
stride=stride,
|
||||
truncation="longest_first",
|
||||
return_overflowing_tokens=True,
|
||||
add_prefix_space=False,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
@@ -531,7 +531,7 @@ class TokenizerTesterMixin:
|
||||
stride=stride,
|
||||
truncation=True,
|
||||
return_overflowing_tokens=True,
|
||||
add_prefix_space=False,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
@@ -562,7 +562,7 @@ class TokenizerTesterMixin:
|
||||
stride=stride,
|
||||
truncation="only_second",
|
||||
return_overflowing_tokens=True,
|
||||
add_prefix_space=False,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
# Overflowing tokens are handled quite differently in slow and fast tokenizers
|
||||
if isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
@@ -638,7 +638,7 @@ class TokenizerTesterMixin:
|
||||
# Testing single inputs
|
||||
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
|
||||
encoded_sequence_dict = tokenizer.encode_plus(
|
||||
sequence_0, add_special_tokens=True, return_special_tokens_mask=True, add_prefix_space=False
|
||||
sequence_0, add_special_tokens=True, return_special_tokens_mask=True # , add_prefix_space=False
|
||||
)
|
||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||
@@ -660,7 +660,7 @@ class TokenizerTesterMixin:
|
||||
sequence_1,
|
||||
add_special_tokens=True,
|
||||
return_special_tokens_mask=True,
|
||||
add_prefix_space=False,
|
||||
# add_prefix_space=False,
|
||||
)
|
||||
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
|
||||
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
|
||||
@@ -1042,7 +1042,7 @@ class TokenizerTesterMixin:
|
||||
def test_pretokenized_inputs(self):
|
||||
# Test when inputs are pretokenized
|
||||
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False, add_prefix_space=True)
|
||||
tokenizers = self.get_tokenizers(do_lower_case=False) # , add_prefix_space=True)
|
||||
for tokenizer in tokenizers:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||
|
||||
|
||||
@@ -63,6 +63,21 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
self.fast_align_python(tokenizer_r, tokenizer_p, tok_case, pretrained_name)
|
||||
self.fast_only(tokenizer_r)
|
||||
|
||||
def test_pretokenized_tokenizers(self):
|
||||
for tok_case in self.TOKENIZERS_CLASSES:
|
||||
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys():
|
||||
|
||||
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
|
||||
# information available in Tokenizer (name, rust class, python class, vocab key name)
|
||||
if tok_case.filter is None or (
|
||||
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name)
|
||||
):
|
||||
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
|
||||
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, add_prefix_space=True)
|
||||
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, add_prefix_space=True)
|
||||
|
||||
self.assert_pretokenized_inputs(tokenizer_r, tokenizer_p)
|
||||
|
||||
def fast_align_python(self, tokenizer_r, tokenizer_p, tok_case, pretrained_name):
|
||||
# Check is_fast is set correctly
|
||||
self.assertFalse(tokenizer_p.is_fast)
|
||||
@@ -75,7 +90,6 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
self.assert_special_tokens_map_equal(tokenizer_r, tokenizer_p)
|
||||
self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p)
|
||||
self.assert_padding(tokenizer_r, tokenizer_p)
|
||||
self.assert_pretokenized_inputs(tokenizer_r, tokenizer_p)
|
||||
self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
|
||||
# TODO: enable for v3.0.0
|
||||
# self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)
|
||||
@@ -341,6 +355,14 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
"return_special_tokens_mask": True,
|
||||
"return_offsets_mapping": False, # Not implemented in python tokenizers
|
||||
}
|
||||
batch_kwargs = {
|
||||
"is_pretokenized": True,
|
||||
"return_token_type_ids": True,
|
||||
"return_attention_mask": True, # we have an 's' here
|
||||
"return_overflowing_tokens": False,
|
||||
"return_special_tokens_mask": True, # we have an 's' here
|
||||
"return_offsets_mapping": False, # Not implemented in python tokenizers
|
||||
}
|
||||
# Test encode_plus for pretokenized inputs
|
||||
output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
|
||||
output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
|
||||
@@ -349,8 +371,8 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
|
||||
# Test batch_encode_plus for pretokenized inputs
|
||||
input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
|
||||
output_r = tokenizer_r.batch_encode_plus(input_batch, **kwargs)
|
||||
output_p = tokenizer_p.batch_encode_plus(input_batch, **kwargs)
|
||||
output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
|
||||
output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
|
||||
for key in output_p.keys():
|
||||
self.assertEqual(output_p[key], output_r[key])
|
||||
|
||||
@@ -370,8 +392,8 @@ class CommonFastTokenizerTest(unittest.TestCase):
|
||||
pretokenized_input_simple + pretokenized_input_pair,
|
||||
pretokenized_input_pair,
|
||||
]
|
||||
output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **kwargs)
|
||||
output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **kwargs)
|
||||
output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
|
||||
output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
|
||||
for key in output_p.keys():
|
||||
self.assertEqual(output_p[key], output_r[key])
|
||||
|
||||
@@ -756,8 +778,8 @@ class RobertaFastTokenizerTest(CommonFastTokenizerTest):
|
||||
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
|
||||
|
||||
# Rust correctly handles the space before the mask while python doesnt
|
||||
self.assertSequenceEqual(tokens_r["input_ids"], [0, 83, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
|
||||
self.assertSequenceEqual(tokens_p["input_ids"], [0, 83, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
|
||||
self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
|
||||
self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
|
||||
|
||||
# token_type_ids should put 0 everywhere
|
||||
self.assertEquals(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
|
||||
@@ -768,9 +790,10 @@ class RobertaFastTokenizerTest(CommonFastTokenizerTest):
|
||||
sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
|
||||
)
|
||||
|
||||
# Rust should have 'Ġ' before <mask> which should be left as an entire token
|
||||
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
|
||||
self.assertSequenceEqual(tokens_r, ["<s>", "ĠA", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"])
|
||||
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
|
||||
self.assertSequenceEqual(tokens_r, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"])
|
||||
self.assertSequenceEqual(tokens_p, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"])
|
||||
|
||||
|
||||
class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
|
||||
@@ -840,3 +863,7 @@ class TransfoXLFastTokenizerTest(NoPaddingTokenFastTokenizerMatchingTest):
|
||||
@require_torch
|
||||
def test_all_tokenizers(self):
|
||||
super().test_all_tokenizers()
|
||||
|
||||
@require_torch
|
||||
def test_pretokenized_tokenizers(self):
|
||||
super().test_pretokenized_tokenizers()
|
||||
|
||||
@@ -80,12 +80,12 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||
text = "lower newer"
|
||||
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||
tokens = tokenizer.tokenize(text, add_prefix_space=True)
|
||||
bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
|
||||
tokens = tokenizer.tokenize(text) # , add_prefix_space=True)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + [tokenizer.unk_token]
|
||||
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||
input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
def roberta_dict_integration_testing(self):
|
||||
@@ -124,7 +124,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]
|
||||
|
||||
# Testing encoder arguments
|
||||
encoded = tokenizer.encode(sequence, add_special_tokens=False)
|
||||
encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False)
|
||||
first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
|
||||
self.assertNotEqual(first_char, space_encoding)
|
||||
|
||||
@@ -135,7 +135,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer.add_special_tokens({"bos_token": "<s>"})
|
||||
encoded = tokenizer.encode(sequence, add_special_tokens=True)
|
||||
first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
|
||||
self.assertEqual(first_char, space_encoding)
|
||||
self.assertNotEqual(first_char, space_encoding)
|
||||
|
||||
# Testing spaces after special tokenss
|
||||
mask = "<mask>"
|
||||
|
||||
Reference in New Issue
Block a user