Support loading a model without a config.json file (#32356)

* support loading model without config.json file

* fix condition

* update tests

* add test

* ruff

* ruff

* ruff
This commit is contained in:
Ita Zaporozhets
2024-09-06 07:49:47 -04:00
committed by GitHub
parent e1c2b69c34
commit 363301f221
7 changed files with 27 additions and 13 deletions

View File

@@ -104,13 +104,6 @@ class AutoConfigTest(unittest.TestCase):
):
_ = AutoConfig.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER, revision="aaaaaa")
def test_configuration_not_found(self):
    """Loading a config from a repo that has no config.json must raise EnvironmentError.

    The expected message is passed to ``assertRaisesRegex``, so it is matched as a
    regular expression against the text of the raised error.
    """
    # NOTE(review): the hunk header above this test reads 13 -> 6 lines, i.e. seven lines
    # fewer after the change, which matches this six-line test plus (presumably) a blank
    # separator. This is therefore likely the code the commit deletes; confirm against
    # the raw diff.
    expected_error = "hf-internal-testing/no-config-test-repo does not appear to have a file named config.json."
    with self.assertRaisesRegex(EnvironmentError, expected_error):
        _ = AutoConfig.from_pretrained("hf-internal-testing/no-config-test-repo")
def test_from_pretrained_dynamic_config(self):
# If remote code is not set, we will time out when asking whether to load the model.
with self.assertRaises(ValueError):

View File

@@ -20,6 +20,7 @@ import tempfile
import unittest
from datasets import load_dataset
from huggingface_hub import hf_hub_download
from transformers import (
SPIECE_UNDERLINE,
@@ -330,6 +331,15 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
fast_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True),
)
def test_load_tokenizer_with_model_file_only(self):
    """Check that both tokenizer classes load from a directory holding only ``tokenizer.model``.

    Only ``tokenizer.model`` is downloaded from ``huggyllama/llama-7b`` into a temporary
    directory. The fast tokenizer class and then the slow one are loaded from that
    directory, and each must encode the same sentence to the same token ids.
    """
    sample_text = "This is a test"
    expected_ids = [1, 910, 338, 263, 1243]

    with tempfile.TemporaryDirectory() as model_dir:
        # NOTE(review): presumably requires network access to the Hub — confirm how CI runs this.
        hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model", local_dir=model_dir)

        # Fast tokenizer first, then slow, in the same order as the two explicit checks.
        for tokenizer_cls in (self.rust_tokenizer_class, self.tokenizer_class):
            tokenizer = tokenizer_cls.from_pretrained(model_dir)
            self.assertEqual(tokenizer.encode(sample_text), expected_ids)
@require_torch
@require_sentencepiece

View File

@@ -247,12 +247,10 @@ class ConfigTestUtils(unittest.TestCase):
self.assertEqual(config.text_config.__class__.__name__, "CLIPTextConfig")
# NOTE(review): this is a diff hunk rendered without +/- markers or indentation, so the old and
# new versions of the test body are interleaved below. The hunk header above (12 -> 10 lines)
# and the commit title suggest the `assertRaises(OSError)` block is the removed side and the
# first plain `from_pretrained` call is the added side — confirm against the raw diff.
def test_from_pretrained_subfolder(self):
# Expects an OSError when the repo id is loaded without a `subfolder` argument.
with self.assertRaises(OSError):
# config is in subfolder, the following should not work without specifying the subfolder
_ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder")
# Loads the same repo id, again without `subfolder`, and only checks the result is not None.
config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder")
self.assertIsNotNone(config)
# Loads once more with subfolder="bert" and checks the result is not None.
config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert-subfolder", subfolder="bert")
self.assertIsNotNone(config)
def test_cached_files_are_used_when_internet_is_down(self):