diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index aecc37223f..f3b9efa9be 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -1,5 +1,4 @@ import gzip -import hashlib import json import multiprocessing import os @@ -11,6 +10,7 @@ from pathlib import Path import numpy as np from arguments import PreprocessingArguments from datasets import load_dataset +from huggingface_hub.utils import insecure_hashlib from minhash_deduplication import deduplicate_dataset from transformers import AutoTokenizer, HfArgumentParser @@ -21,7 +21,7 @@ PATTERN = re.compile(r"\s+") def get_hash(example): """Get hash of content field.""" - return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()} + return {"hash": insecure_hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()} def line_stats(example): diff --git a/examples/research_projects/lxmert/utils.py b/examples/research_projects/lxmert/utils.py index 2fc6ea2062..c75f523a08 100644 --- a/examples/research_projects/lxmert/utils.py +++ b/examples/research_projects/lxmert/utils.py @@ -28,7 +28,6 @@ import tempfile from collections import OrderedDict from contextlib import contextmanager from functools import partial -from hashlib import sha256 from io import BytesIO from pathlib import Path from urllib.parse import urlparse @@ -39,6 +38,7 @@ import numpy as np import requests import wget from filelock import FileLock +from huggingface_hub.utils import insecure_hashlib from PIL import Image from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -402,12 +402,12 @@ def get_from_cache( def url_to_filename(url, etag=None): url_bytes = url.encode("utf-8") - url_hash = sha256(url_bytes) + url_hash = insecure_hashlib.sha256(url_bytes) filename = url_hash.hexdigest() if etag: etag_bytes = etag.encode("utf-8") - etag_hash = sha256(etag_bytes) + etag_hash = insecure_hashlib.sha256(etag_bytes) filename += "." + etag_hash.hexdigest() if url.endswith(".h5"): diff --git a/examples/research_projects/visual_bert/utils.py b/examples/research_projects/visual_bert/utils.py index 2fc6ea2062..c75f523a08 100644 --- a/examples/research_projects/visual_bert/utils.py +++ b/examples/research_projects/visual_bert/utils.py @@ -28,7 +28,6 @@ import tempfile from collections import OrderedDict from contextlib import contextmanager from functools import partial -from hashlib import sha256 from io import BytesIO from pathlib import Path from urllib.parse import urlparse @@ -39,6 +38,7 @@ import numpy as np import requests import wget from filelock import FileLock +from huggingface_hub.utils import insecure_hashlib from PIL import Image from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -402,12 +402,12 @@ def get_from_cache( def url_to_filename(url, etag=None): url_bytes = url.encode("utf-8") - url_hash = sha256(url_bytes) + url_hash = insecure_hashlib.sha256(url_bytes) filename = url_hash.hexdigest() if etag: etag_bytes = etag.encode("utf-8") - etag_hash = sha256(etag_bytes) + etag_hash = insecure_hashlib.sha256(etag_bytes) filename += "." + etag_hash.hexdigest() if url.endswith(".h5"): diff --git a/setup.py b/setup.py index 764411dab3..86c8a8a5bf 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ _deps = [ "fugashi>=1.0", "GitPython<3.1.19", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.16.4,<1.0", + "huggingface-hub>=0.19.3,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "isort>=5.5.4", @@ -321,6 +321,7 @@ extras["testing"] = ( "rjieba", "beautifulsoup4", "tensorboard", + "pydantic", ) + extras["retrieval"] + extras["modelcreation"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index d460790c8e..30e902d698 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -25,7 +25,7 @@ deps = { "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.16.4,<1.0", + "huggingface-hub": "huggingface-hub>=0.19.3,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "isort": "isort>=5.5.4", diff --git a/src/transformers/models/whisper/convert_openai_to_hf.py b/src/transformers/models/whisper/convert_openai_to_hf.py index 0db555c4aa..0d6cdaa958 100755 --- a/src/transformers/models/whisper/convert_openai_to_hf.py +++ b/src/transformers/models/whisper/convert_openai_to_hf.py @@ -15,7 +15,6 @@ # limitations under the License. import argparse -import hashlib import io import json import os @@ -24,6 +23,7 @@ import urllib import warnings import torch +from huggingface_hub.utils import insecure_hashlib from torch import nn from tqdm import tqdm @@ -114,7 +114,7 @@ def _download(url: str, root: str) -> io.BytesIO: if os.path.isfile(download_target): model_bytes = open(download_target, "rb").read() - if hashlib.sha256(model_bytes).hexdigest() == expected_sha256: + if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256: return torch.load(io.BytesIO(model_bytes)) else: warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") @@ -132,7 +132,7 @@ def _download(url: str, root: str) -> io.BytesIO: loop.update(len(buffer)) model_bytes = open(download_target, "rb").read() - if hashlib.sha256(model_bytes).hexdigest() != expected_sha256: + if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256: raise RuntimeError( "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model." ) diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index 054574b4fd..009aa1c942 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import hashlib import unittest +from huggingface_hub.utils import insecure_hashlib + from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available from transformers.pipelines import DepthEstimationPipeline, pipeline from transformers.testing_utils import ( @@ -44,7 +45,7 @@ else: def hashimage(image: Image) -> str: - m = hashlib.md5(image.tobytes()) + m = insecure_hashlib.md5(image.tobytes()) return m.hexdigest() diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index dbc0c0db80..9c5c8fdfd4 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import hashlib import tempfile import unittest from typing import Dict @@ -21,6 +20,7 @@ import datasets import numpy as np import requests from datasets import load_dataset +from huggingface_hub.utils import insecure_hashlib from transformers import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, @@ -59,7 +59,7 @@ else: def hashimage(image: Image) -> str: - m = hashlib.md5(image.tobytes()) + m = insecure_hashlib.md5(image.tobytes()) return m.hexdigest()[:10] diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index cf17039065..c9a44a5354 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import hashlib import unittest from typing import Dict import numpy as np +from huggingface_hub.utils import insecure_hashlib from transformers import ( MODEL_FOR_MASK_GENERATION_MAPPING, @@ -46,7 +46,7 @@ else: def hashimage(image: Image) -> str: - m = hashlib.md5(image.tobytes()) + m = insecure_hashlib.md5(image.tobytes()) return m.hexdigest()[:10]