[Patch-t5-tokenizer] Patches the changes on T5 to make sure previous behaviour is still valide for beginning of words (#24622)

* patch `_tokenize` function

* more tests

* properly fix

* fixup

* Update src/transformers/models/t5/tokenization_t5.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fix without ifs

* update

* protect import

* add python processing

* is first needed

* add doc and update with lefacy

* updaate

* fix T5 SPM converter

* styling

* fix T5 warning

* add is_seqio_available

* remove is_first

* revert some changes

* more tests and update

* update llama test batterie

* fixup

* refactor T5 spm common tests

* draft the llama tests

* update

* uopdate test

* nits

* refine

* name nit

* fix t5 tests

* fix T5

* update

* revert convert slow to fast changes that fail lots of tests

* legacy support

* fixup

* nits is first not defined

* don't use legacy behaviour for switch transformers

* style

* My attempt to check.

* nits

* fixes

* update

* fixup

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* updates

* fixup

* add legacy warning

* fixup

* warning_once nit

* update t5 documentation test

* update llama tok documentation

* add space to warning

* nits

* nit

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* last nits

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
Arthur
2023-07-11 22:02:18 +09:00
committed by GitHub
parent b3ab3fac1d
commit b15343de6f
11 changed files with 387 additions and 52 deletions

View File

@@ -498,3 +498,89 @@ class LlamaIntegrationTest(unittest.TestCase):
decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
self.assertEqual(decoded1, decoded2)
@require_sentencepiece
@require_tokenizers
class CommonSpmIntegrationTests(unittest.TestCase):
"""
A class that regroups important test to make sure that we properly handle the special tokens.
"""
@classmethod
def setUpClass(cls):
tokenizer = LlamaTokenizer(SAMPLE_VOCAB, extra_ids=0, add_bos_token=False, legacy=False)
tokenizer.add_special_tokens({"additional_special_tokens": ["<s>"]})
tokenizer._create_trie(tokenizer.all_special_tokens)
# TODO ArthurZ the above is necessary as addedTokens / intialization sucks. Trie is not correctly created
# So the extra ids are split....
cls.tokenizer = tokenizer
return cls
def test_add_dummy_prefix(self):
# make sure `'▁'` is prepended, and outputs match sp_model's
# `sentencepiece.NormalizerSpec.add_dummy_prefix` attribute
input_ids = self.tokenizer.encode(". Hello")
self.assertEqual(input_ids, [7, 4, 156, 86, 20])
sp_encode = self.tokenizer.sp_model.encode(". Hello")
self.assertEqual(input_ids, sp_encode)
tokens = self.tokenizer.tokenize(". Hello")
self.assertEqual(tokens, ["", ".", "▁He", "ll", "o"])
def test_remove_extra_whitespaces(self):
# make sure the extra spaces are eaten. Since the sample vocab does not have
# `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False
input_ids = self.tokenizer.encode(" . Hello")
self.assertEqual(input_ids, [7, 4, 156, 86, 20])
sp_encode = self.tokenizer.sp_model.encode(" . Hello")
self.assertEqual(input_ids, sp_encode)
tokens = self.tokenizer.tokenize(" . Hello")
self.assertEqual(tokens, ["", ".", "▁He", "ll", "o"])
# `'▁'` is also a whitespace
input_ids = self.tokenizer.encode("▁He is not")
self.assertEqual(input_ids, [156, 46, 44])
tokens = self.tokenizer.tokenize("▁He is not")
sp_encode = self.tokenizer.sp_model.encode("▁He is not")
self.assertEqual(input_ids, sp_encode)
self.assertEqual(tokens, ["▁He", "▁is", "▁not"]) # no extra space added
input_ids = self.tokenizer.encode("▁He is not<s> ▁He")
self.assertEqual(input_ids, [156, 46, 44, 1, 156])
tokens = self.tokenizer.tokenize("▁He is not<s> ▁He")
self.assertEqual(tokens, ["▁He", "▁is", "▁not", "<s>", "▁He"]) # spaces are eaten by spm + our strip
# make sure that the output after the extra id is the same as if
# extra_id was not there
input_ids = self.tokenizer.encode("▁He is not ▁He")
self.assertEqual(input_ids, [156, 46, 44, 156])
tokens = self.tokenizer.tokenize("▁He is not ▁He")
self.assertEqual(tokens, ["▁He", "▁is", "▁not", "▁He"]) # spaces are eaten by spm even if not start
def test_character_after_special_token(self):
# Make sure that `tokenizer.tokenize` is similar to
# adding the equivalent special token to the vocab
input_ids = self.tokenizer.encode("Hey <s>I")
self.assertEqual(input_ids, [156, 30, 1, 100])
sp_encode = self.tokenizer.sp_model.encode("Hey .I")
# the last token should be 100
self.assertEqual(input_ids[-1], sp_encode[-1])
tokens = self.tokenizer.tokenize("<s>I")
self.assertEqual(tokens, ["<s>", "I"])
input_ids = self.tokenizer.encode("Hello, <s>,")
self.assertEqual(input_ids, [156, 86, 20, 3, 1, 3])
tokens = self.tokenizer.tokenize("Hello, <s>,")
self.assertEqual(tokens, ["▁He", "ll", "o", ",", "<s>", ","])
def test_special_tokens_strip(self):
input_ids = self.tokenizer.encode(" <s> ,")
self.assertEqual(input_ids, [1, 7, 3])
tokens = self.tokenizer.tokenize(" <s> ,")
# spaces are eaten by rstrip / lstrip + spm sp_model.encode(" ") = []
self.assertEqual(tokens, ["<s>", "", ","])
input_ids = self.tokenizer.encode("No <s> ▁He")
self.assertEqual(input_ids, [284, 1, 156])
tokens = self.tokenizer.tokenize("No <s> ▁He")
self.assertEqual(tokens, ["▁No", "<s>", "▁He"]) # spaces are eaten by rstrip / lstrip