Add clean_up_tokenization_spaces to config (#22341)

* add draft changes

* fix failing wav2vec

* style

* make sure that the argument is saved + add tests

* style

* fixup

* update test

* default clean_up_tokenization_spaces to False for Bloom and Llama

* Update code based on review

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>

* style

* quality

---------

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>
This commit is contained in:
Arthur
2023-03-29 13:21:07 +02:00
committed by GitHub
parent b29fd6971d
commit 8d9c3836be
16 changed files with 150 additions and 42 deletions

View File

@@ -3895,6 +3895,51 @@ class TokenizerTesterMixin:
# Should not raise an error
self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
def test_clean_up_tokenization_spaces(self):
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
assert tokenizer.clean_up_tokenization_spaces is True
tokens = tokenizer.encode("This shouldn't be! He'll go.")
decoded = tokenizer.decode(tokens)
assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
tokenizer.clean_up_tokenization_spaces = False
decoded = tokenizer.decode(tokens)
assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)
# Fast from slow
with tempfile.TemporaryDirectory() as tmp_dir_2:
tokenizer.save_pretrained(tmp_dir_2)
tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
del tokenizer
assert tokenizer_fast.clean_up_tokenization_spaces is False
decoded = tokenizer_fast.decode(tokens)
# fast and slow don't have the same output when we don't cleanup
# tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"
tokenizer_fast.clean_up_tokenization_spaces = True
assert tokenizer_fast.clean_up_tokenization_spaces is True
decoded = tokenizer_fast.decode(tokens)
assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
# Slow from fast
with tempfile.TemporaryDirectory() as tmp_dir_2:
tokenizer_fast.clean_up_tokenization_spaces = False
tokenizer_fast.save_pretrained(tmp_dir_2)
tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)
assert tokenizer_fast.clean_up_tokenization_spaces is False
decoded = tokenizer.decode(tokens)
assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
tokenizer.clean_up_tokenization_spaces = True
decoded = tokenizer.decode(tokens)
assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
class TokenizerUtilTester(unittest.TestCase):
def test_cached_files_are_used_when_internet_is_down(self):