fix: Text splitting in the BasicTokenizer (#22280)
* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer
* account for apostrophe at start of new word
* remove _run_split_on_punc, use re.findall instead
* remove debugging, make style and quality
* use pattern and punc splitting, repo-consistency will fail
* remove commented out debugging
* adds bool args to BasicTokenizer, remove pattern
* do_split_on_punc default True
* clean stray comments and line breaks
* rebase, repo-consistency
* update to just do punctuation split
* add unicode normalizing back
* remove redundant line
This commit is contained in:
@@ -81,7 +81,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||
|
||||
- text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat"
|
||||
+ text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
|
||||
text_tokenized_s = tokenizer_s.tokenize(text)
|
||||
text_tokenized_r = tokenizer_r.tokenize(text)
|
||||
|
||||
@@ -122,7 +122,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
# "\u0085", # (next line)
|
||||
]
|
||||
|
||||
- # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms
|
||||
+ # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
|
||||
# it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
|
||||
# space (and thus into an empty list).
|
||||
|
||||
|
||||
Reference in New Issue
Block a user