Add versioning system to fast tokenizer files (#12713)
* Add versioning system to fast tokenizer files * Deal with offline mode * Use staging env in tests * Style * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Style Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
@@ -14,11 +14,13 @@
|
||||
# limitations under the License.
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import PreTrainedTokenizerFast
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
||||
from transformers.testing_utils import require_tokenizers
|
||||
|
||||
from .test_tokenization_common import TokenizerTesterMixin
|
||||
@@ -98,6 +100,51 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.tmpdirname = tmpdirname_orig
|
||||
|
||||
|
||||
@require_tokenizers
|
||||
class TokenizerVersioningTest(unittest.TestCase):
|
||||
def test_local_versioning(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
json_tokenizer = json.loads(tokenizer._tokenizer.to_str())
|
||||
json_tokenizer["model"]["vocab"]["huggingface"] = len(tokenizer)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
tokenizer.save_pretrained(tmp_dir)
|
||||
json.dump(json_tokenizer, open(os.path.join(tmp_dir, "tokenizer.4.0.0.json"), "w"))
|
||||
|
||||
# This should pick the new tokenizer file as the version of Transformers is > 4.0.0
|
||||
new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
|
||||
self.assertEqual(len(new_tokenizer), len(tokenizer) + 1)
|
||||
json_tokenizer = json.loads(new_tokenizer._tokenizer.to_str())
|
||||
self.assertIn("huggingface", json_tokenizer["model"]["vocab"])
|
||||
|
||||
# Will need to be adjusted if we reach v42 and this test is still here.
|
||||
# Should pick the old tokenizer file as the version of Transformers is < 4.0.0
|
||||
shutil.move(os.path.join(tmp_dir, "tokenizer.4.0.0.json"), os.path.join(tmp_dir, "tokenizer.42.0.0.json"))
|
||||
new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
|
||||
self.assertEqual(len(new_tokenizer), len(tokenizer))
|
||||
json_tokenizer = json.loads(new_tokenizer._tokenizer.to_str())
|
||||
self.assertNotIn("huggingface", json_tokenizer["model"]["vocab"])
|
||||
|
||||
def test_repo_versioning(self):
|
||||
# This repo has two tokenizer files, one for v4.0.0 and above with an added token, one for versions lower.
|
||||
repo = "sgugger/finetuned-bert-mrpc"
|
||||
|
||||
# This should pick the new tokenizer file as the version of Transformers is > 4.0.0
|
||||
tokenizer = AutoTokenizer.from_pretrained(repo)
|
||||
self.assertEqual(len(tokenizer), 28997)
|
||||
json_tokenizer = json.loads(tokenizer._tokenizer.to_str())
|
||||
self.assertIn("huggingface", json_tokenizer["model"]["vocab"])
|
||||
|
||||
# Testing an older version by monkey-patching the version in the module it's used.
|
||||
import transformers as old_transformers
|
||||
|
||||
old_transformers.tokenization_utils_base.__version__ = "3.0.0"
|
||||
old_tokenizer = old_transformers.models.auto.AutoTokenizer.from_pretrained(repo)
|
||||
self.assertEqual(len(old_tokenizer), 28996)
|
||||
json_tokenizer = json.loads(old_tokenizer._tokenizer.to_str())
|
||||
self.assertNotIn("huggingface", json_tokenizer["model"]["vocab"])
|
||||
|
||||
|
||||
@require_tokenizers
|
||||
class ReduceMutableBorrowTests(unittest.TestCase):
|
||||
def test_async_share_tokenizer(self):
|
||||
|
||||
Reference in New Issue
Block a user