Fix llama tokenizer (#22402)

* draft * update tokenization llama and conversion script * more updates * initial commit * style * default pad to None * draft tokenization tests * update test * update tokenization tests * nits * update * versioning test * major fix * fix more tests * finish fixing special masks * last nit * more nits * add encode decode tests * add more * fix token type ids * style
2023-04-03 15:07:32 +02:00
parent 9eae4aa576
commit c0f99b4d2e
4 changed files with 480 additions and 62 deletions
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3932,7 +3932,7 @@ class TokenizerTesterMixin:
            tokenizer_fast.save_pretrained(tmp_dir_2)
            tokenizer = BertTokenizer.from_pretrained(tmp_dir_2)

-        assert tokenizer_fast.clean_up_tokenization_spaces is False
+        assert tokenizer.clean_up_tokenization_spaces is False
        decoded = tokenizer.decode(tokens)
        assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]"