Add clean_up_tokenization_spaces to config (#22341)
* add draft changes
* fix failing wav2vec
* style
* make sure that the argument is saved + add tests
* style
* fixup
* update test
* default clean_up_tokenization_spaces to False for Bloom and Llama
* Update code based on review

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>

* style
* quality

---------

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>
This commit is contained in:
@@ -3895,6 +3895,51 @@ class TokenizerTesterMixin:
|
||||
# Should not raise an error
|
||||
self.rust_tokenizer_class.from_pretrained(tmp_dir_2)
|
||||
|
||||
def test_clean_up_tokenization_spaces(self):
    """Check that `clean_up_tokenization_spaces` drives `decode` and survives save/load.

    Three things are exercised:
      1. The attribute on a slow `BertTokenizer` controls the output of
         `decode`, and matches passing the flag explicitly to `decode`.
      2. Saving a slow tokenizer and reloading it as a fast one keeps the
         attribute value.
      3. Saving a fast tokenizer and reloading it as a slow one keeps the
         attribute value.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    assert tokenizer.clean_up_tokenization_spaces is True

    tokens = tokenizer.encode("This shouldn't be! He'll go.")
    decoded = tokenizer.decode(tokens)
    assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"

    tokenizer.clean_up_tokenization_spaces = False
    decoded = tokenizer.decode(tokens)
    assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"
    # The attribute and the explicit keyword argument must agree.
    assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False)

    # Fast from slow
    with tempfile.TemporaryDirectory() as tmp_dir_2:
        tokenizer.save_pretrained(tmp_dir_2)
        tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2)
        del tokenizer

    assert tokenizer_fast.clean_up_tokenization_spaces is False
    decoded = tokenizer_fast.decode(tokens)
    # fast and slow don't have the same output when we don't cleanup
    # tokenization space. Here `be!` vs `be !` and `go.` vs `go .`
    assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]"

    tokenizer_fast.clean_up_tokenization_spaces = True
    assert tokenizer_fast.clean_up_tokenization_spaces is True

    decoded = tokenizer_fast.decode(tokens)
    assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"

    # Slow from fast
    with tempfile.TemporaryDirectory() as tmp_dir_2:
        tokenizer_fast.clean_up_tokenization_spaces = False
        tokenizer_fast.save_pretrained(tmp_dir_2)
        tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)

    # Check the reloaded slow tokenizer, not `tokenizer_fast`: the latter was
    # set to False just above, so asserting on it would pass even if the
    # value had not been saved and restored.
    assert tokenizer.clean_up_tokenization_spaces is False
    decoded = tokenizer.decode(tokens)
    assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"

    tokenizer.clean_up_tokenization_spaces = True
    decoded = tokenizer.decode(tokens)
    assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]"
|
||||
|
||||
|
||||
class TokenizerUtilTester(unittest.TestCase):
|
||||
def test_cached_files_are_used_when_internet_is_down(self):
|
||||
|
||||
Reference in New Issue
Block a user