Make Transformers use cache files when hf.co is down (#16362)

* Make Transformers use cache files when hf.co is down
* Fix tests
* Was there a random circleCI failure?
* Isolate patches
* Style
* Comment out the failure since it doesn't fail anymore
* Better comment
2022-03-23 15:56:49 -04:00
parent 8a69e023bf
commit c595b6e6a9
13 changed files with 148 additions and 35 deletions
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -20,7 +20,7 @@ import shutil
 import sys
 import tempfile
 import unittest
-import unittest.mock
+import unittest.mock as mock
 from pathlib import Path

 from huggingface_hub import Repository, delete_repo, login
@@ -304,6 +304,22 @@ class ConfigTestUtils(unittest.TestCase):
                f"pick another value for them: {', '.join(keys_with_defaults)}."
            )

+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = []
+        response_mock.raise_for_status.side_effect = HTTPError
+
+        # Download this model to make sure it's in the cache.
+        _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
+            _ = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This checks that we did call the fake head request
+            mock_head.assert_called()
+

 class ConfigurationVersioningTest(unittest.TestCase):
    def test_local_versioning(self):
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -19,6 +19,7 @@ import os
 import sys
 import tempfile
 import unittest
+import unittest.mock as mock
 from pathlib import Path

 from huggingface_hub import Repository, delete_repo, login
@@ -116,6 +117,23 @@ class FeatureExtractionSavingTestMixin:
        self.assertIsNotNone(feat_extract)


+class FeatureExtractorUtilTester(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = []
+        response_mock.raise_for_status.side_effect = HTTPError
+
+        # Download this model to make sure it's in the cache.
+        _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
+            _ = Wav2Vec2FeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2")
+            # This checks that we did call the fake head request
+            mock_head.assert_called()
+
+
@is_staging_test
 class FeatureExtractorPushToHubTester(unittest.TestCase):
    @classmethod
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -23,6 +23,7 @@ import random
 import sys
 import tempfile
 import unittest
+import unittest.mock as mock
 import warnings
 from pathlib import Path
 from typing import Dict, List, Tuple
@@ -2272,6 +2273,22 @@ class ModelUtilsTest(TestCasePlus):
        for p1, p2 in zip(model.parameters(), new_model.parameters()):
            self.assertTrue(torch.equal(p1, p2))

+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = []
+        response_mock.raise_for_status.side_effect = HTTPError
+
+        # Download this model to make sure it's in the cache.
+        _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
+            _ = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This checks that we did call the fake head request
+            mock_head.assert_called()
+

@require_torch
@is_staging_test
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -21,6 +21,7 @@ import os
 import random
 import tempfile
 import unittest
+import unittest.mock as mock
 from importlib import import_module
 from typing import List, Tuple

@@ -1555,6 +1556,22 @@ class UtilsFunctionsTest(unittest.TestCase):
        tf.debugging.assert_near(non_inf_output, non_inf_expected_output, rtol=1e-12)
        tf.debugging.assert_equal(non_inf_idx, non_inf_expected_idx)

+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = []
+        response_mock.raise_for_status.side_effect = HTTPError
+
+        # Download this model to make sure it's in the cache.
+        _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
+            _ = TFBertModel.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This checks that we did call the fake head request
+            mock_head.assert_called()
+
    # tests whether the unpack_inputs function behaves as expected
    def test_unpack_inputs(self):
        class DummyModel:
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -24,6 +24,7 @@ import shutil
 import sys
 import tempfile
 import unittest
+import unittest.mock as mock
 from collections import OrderedDict
 from itertools import takewhile
 from pathlib import Path
@@ -3742,6 +3743,24 @@ class TokenizerTesterMixin:
                    self.rust_tokenizer_class.from_pretrained(tmp_dir_2)


+class TokenizerUtilTester(unittest.TestCase):
+    def test_cached_files_are_used_when_internet_is_down(self):
+        # A mock response for an HTTP head request to emulate server down
+        response_mock = mock.Mock()
+        response_mock.status_code = 500
+        response_mock.headers = []
+        response_mock.raise_for_status.side_effect = HTTPError
+
+        # Download this model to make sure it's in the cache.
+        _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+
+        # Under the mock environment we get a 500 error when trying to reach the model.
+        with mock.patch("transformers.utils.hub.requests.head", return_value=response_mock) as mock_head:
+            _ = BertTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+            # This checks that we did call the fake head request
+            mock_head.assert_called()
+
+
@is_staging_test
 class TokenizerPushToHubTester(unittest.TestCase):
    vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "bla", "blou"]
--- a/tests/utils/test_offline.py
+++ b/tests/utils/test_offline.py
@@ -59,10 +59,10 @@ socket.socket = offline_socket
        # next emulate no network
        cmd = [sys.executable, "-c", "\n".join([load, mock, run])]

-        # should normally fail as it will fail to lookup the model files w/o the network
-        env["TRANSFORMERS_OFFLINE"] = "0"
-        result = subprocess.run(cmd, env=env, check=False, capture_output=True)
-        self.assertEqual(result.returncode, 1, result.stderr)
+        # Doesn't fail anymore since the model is in the cache due to other tests, so commenting this out.
+        # env["TRANSFORMERS_OFFLINE"] = "0"
+        # result = subprocess.run(cmd, env=env, check=False, capture_output=True)
+        # self.assertEqual(result.returncode, 1, result.stderr)

        # should succeed as TRANSFORMERS_OFFLINE=1 tells it to use local files
        env["TRANSFORMERS_OFFLINE"] = "1"