support loading model without config.json file (#32356)

* support loading model without config.json file
* fix condition
* update tests
* add test
* ruff
* ruff
* ruff
2024-09-06 07:49:47 -04:00
parent e1c2b69c34
commit 363301f221
7 changed files with 27 additions and 13 deletions
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -20,6 +20,7 @@ import tempfile
 import unittest

 from datasets import load_dataset
+from huggingface_hub import hf_hub_download

 from transformers import (
    SPIECE_UNDERLINE,
@@ -330,6 +331,15 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            fast_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True),
        )

+    def test_load_tokenizer_with_model_file_only(self):
+        # Only `tokenizer.model` is downloaded into the directory; fast is loaded first, then slow.
+        expected_ids = [1, 910, 338, 263, 1243]
+        with tempfile.TemporaryDirectory() as model_dir:
+            hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model", local_dir=model_dir)
+            for tokenizer_cls in (self.rust_tokenizer_class, self.tokenizer_class):
+                tokenizer = tokenizer_cls.from_pretrained(model_dir)
+                self.assertEqual(tokenizer.encode("This is a test"), expected_ids)
+

@require_torch
@require_sentencepiece