fix: Text splitting in the BasicTokenizer (#22280)

* fix: Apostraphe splitting in the BasicTokenizer for CLIPTokenizer * account for apostrophe at start of new word * remove _run_split_on_punc, use re.findall instead * remove debugging, make style and quality * use pattern and punc splitting, repo-consistency will fail * remove commented out debugging * adds bool args to BasicTokenizer, remove pattern * do_split_on_punc default True * clean stray comments and line breaks * rebase, repo-consistency * update to just do punctuation split * add unicode normalizing back * remove redundant line
2023-07-11 11:07:58 -04:00
parent 2489e380e4
commit 5739726fcc
22 changed files with 349 additions and 103 deletions
--- a/tests/models/bert/test_tokenization_bert.py
+++ b/tests/models/bert/test_tokenization_bert.py
@@ -182,6 +182,12 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
        )

+    def test_basic_tokenizer_splits_on_punctuation(self):
+        tokenizer = BasicTokenizer()
+        text = "a\n'll !!to?'d of, can't."
+        expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
+        self.assertListEqual(tokenizer.tokenize(text), expected)
+
    def test_wordpiece_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

--- a/tests/models/clip/test_tokenization_clip.py
+++ b/tests/models/clip/test_tokenization_clip.py
@@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
+                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
                text_tokenized_s = tokenizer_s.tokenize(text)
                text_tokenized_r = tokenizer_r.tokenize(text)

@@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                    # "\u0085", # (next line)
                ]

-                # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
+                # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
                # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
                # space (and thus into an empty list).