From 818997788584b9fc043d8b58e078f63aadb6b60e Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:31:54 +0100 Subject: [PATCH] [`Core Tokenization`] Support a fix for spm fast models (#26678) * fix * last attempt * current work * fix forward compatibility * save all special tokens * current state * revert additional changes * updates * remove tokenizer.model * add a test and the fix * nit * revert one more break * fix typefield issue * quality * more tests * fix fields for FC * more nits? * new additional changes * how * some updates * the fix * where do we stand * nits * nits * revert unrelated changes * nits nits nits * styling * don't break llama just yet * revert llama changes * safe arg check * fixup * Add a test for T5 * Necessary changes * Tests passing, added tokens need to not be normalized. If the added tokens are normalized, it will trigger the stripping, which seems to be unwanted for normal functioning * Add even more tests, when normalization is set to True (which does not work :sweat: ) * Add even more tests, when normalization is set to True (which does not work :sweat: ) * Update to main * nits * fmt * more and more tests * comments * revert change as tests are failing * make the test more readable * nits * refactor the test * nit * updates * simplify * style * style * style convert slow * Update src/transformers/convert_slow_tokenizer.py --- src/transformers/convert_slow_tokenizer.py | 17 +++++++---- tests/models/t5/test_tokenization_t5.py | 35 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 46f2b2dc23..53dbfeb6b6 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -552,15 +552,22 @@ class SpmConverter(Converter): def normalizer(self, proto): precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap + 
_normalizers = [ + normalizers.Strip(left=False, right=True), # stripping is important + normalizers.Replace(Regex(" {2,}"), "▁"), + ] if not precompiled_charsmap: - return normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")]) + return normalizers.Sequence(_normalizers) else: - return normalizers.Sequence( - [normalizers.Precompiled(precompiled_charsmap), normalizers.Replace(Regex(" {2,}"), " ")] - ) + return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers) def pre_tokenizer(self, replacement, add_prefix_space): - return pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + prepend_scheme = "always" + if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy: + prepend_scheme = "first" + return pre_tokenizers.Metaspace( + replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme + ) def post_processor(self): return None diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index a141dea86b..5fa0e19c79 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -424,6 +424,41 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(tokens, []) self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str)) + def test_fast_slow_edge_cases(self): + # We are testing spaces before and spaces after special tokens + space transformations + slow_tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False) + fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True) + slow_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=False)) + fast_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=False)) + + edge_case = "Hey!. HowHey !" 
+ EXPECTED_SLOW = ["▁Hey", "!", "", ".", "▁How", "", "He", "y", "", "!"] # fmt: skip + with self.subTest(f"slow {edge_case} normalized = False"): + self.assertEqual(slow_tokenizer.tokenize(edge_case), EXPECTED_SLOW) + with self.subTest(f"Fast {edge_case} normalized = False"): + self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_SLOW) + + hard_case = "Hey! . How Hey ! . " + EXPECTED_SLOW = ["▁Hey", "!", "", ".", "▁How", "", "▁Hey", "", "▁", "!", "▁", "."] # fmt: skip + with self.subTest(f"slow {edge_case} normalized = False"): + self.assertEqual(slow_tokenizer.tokenize(hard_case), EXPECTED_SLOW) + with self.subTest(f"fast {edge_case} normalized = False"): + self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_SLOW) + + fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True) + fast_tokenizer.add_tokens(AddedToken("", rstrip=False, lstrip=False, normalized=True)) + + # `normalized=True` is the default normalization scheme when adding a token. Normalize -> don't strip the space. + # the issue now is that our slow tokenizer should NOT strip the space if we want to simulate sentencepiece token addition. + + EXPECTED_FAST = ["▁Hey", "!", "", ".", "▁How", "", "He", "y", "▁", "", "!"] # fmt: skip + with self.subTest(f"fast {edge_case} normalized = True"): + self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_FAST) + + EXPECTED_FAST = ['▁Hey', '!', '▁', '', '.', '▁How', '', '▁Hey','▁', '', '▁', '!', '▁', '.'] # fmt: skip + with self.subTest(f"fast {edge_case} normalized = False"): + self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_FAST) + @require_sentencepiece @require_tokenizers