Set usedforsecurity=False in hashlib methods (FIPS compliance) (#27483)
* Set usedforsecurity=False in hashlib methods (FIPS compliance)
* trigger ci
* tokenizers version
* deps
* bump hfh version
* let's try this
This commit is contained in:
@@ -1,5 +1,4 @@
|
|||||||
import gzip
|
import gzip
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
@@ -11,6 +10,7 @@ from pathlib import Path
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from arguments import PreprocessingArguments
|
from arguments import PreprocessingArguments
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
from minhash_deduplication import deduplicate_dataset
|
from minhash_deduplication import deduplicate_dataset
|
||||||
|
|
||||||
from transformers import AutoTokenizer, HfArgumentParser
|
from transformers import AutoTokenizer, HfArgumentParser
|
||||||
@@ -21,7 +21,7 @@ PATTERN = re.compile(r"\s+")
|
|||||||
|
|
||||||
def get_hash(example):
|
def get_hash(example):
|
||||||
"""Get hash of content field."""
|
"""Get hash of content field."""
|
||||||
return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
|
return {"hash": insecure_hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()}
|
||||||
|
|
||||||
|
|
||||||
def line_stats(example):
|
def line_stats(example):
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ import tempfile
|
|||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from hashlib import sha256
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -39,6 +38,7 @@ import numpy as np
|
|||||||
import requests
|
import requests
|
||||||
import wget
|
import wget
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from yaml import Loader, dump, load
|
from yaml import Loader, dump, load
|
||||||
@@ -402,12 +402,12 @@ def get_from_cache(
|
|||||||
|
|
||||||
def url_to_filename(url, etag=None):
|
def url_to_filename(url, etag=None):
|
||||||
url_bytes = url.encode("utf-8")
|
url_bytes = url.encode("utf-8")
|
||||||
url_hash = sha256(url_bytes)
|
url_hash = insecure_hashlib.sha256(url_bytes)
|
||||||
filename = url_hash.hexdigest()
|
filename = url_hash.hexdigest()
|
||||||
|
|
||||||
if etag:
|
if etag:
|
||||||
etag_bytes = etag.encode("utf-8")
|
etag_bytes = etag.encode("utf-8")
|
||||||
etag_hash = sha256(etag_bytes)
|
etag_hash = insecure_hashlib.sha256(etag_bytes)
|
||||||
filename += "." + etag_hash.hexdigest()
|
filename += "." + etag_hash.hexdigest()
|
||||||
|
|
||||||
if url.endswith(".h5"):
|
if url.endswith(".h5"):
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ import tempfile
|
|||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from hashlib import sha256
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@@ -39,6 +38,7 @@ import numpy as np
|
|||||||
import requests
|
import requests
|
||||||
import wget
|
import wget
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from yaml import Loader, dump, load
|
from yaml import Loader, dump, load
|
||||||
@@ -402,12 +402,12 @@ def get_from_cache(
|
|||||||
|
|
||||||
def url_to_filename(url, etag=None):
|
def url_to_filename(url, etag=None):
|
||||||
url_bytes = url.encode("utf-8")
|
url_bytes = url.encode("utf-8")
|
||||||
url_hash = sha256(url_bytes)
|
url_hash = insecure_hashlib.sha256(url_bytes)
|
||||||
filename = url_hash.hexdigest()
|
filename = url_hash.hexdigest()
|
||||||
|
|
||||||
if etag:
|
if etag:
|
||||||
etag_bytes = etag.encode("utf-8")
|
etag_bytes = etag.encode("utf-8")
|
||||||
etag_hash = sha256(etag_bytes)
|
etag_hash = insecure_hashlib.sha256(etag_bytes)
|
||||||
filename += "." + etag_hash.hexdigest()
|
filename += "." + etag_hash.hexdigest()
|
||||||
|
|
||||||
if url.endswith(".h5"):
|
if url.endswith(".h5"):
|
||||||
|
|||||||
3
setup.py
3
setup.py
@@ -118,7 +118,7 @@ _deps = [
|
|||||||
"fugashi>=1.0",
|
"fugashi>=1.0",
|
||||||
"GitPython<3.1.19",
|
"GitPython<3.1.19",
|
||||||
"hf-doc-builder>=0.3.0",
|
"hf-doc-builder>=0.3.0",
|
||||||
"huggingface-hub>=0.16.4,<1.0",
|
"huggingface-hub>=0.19.3,<1.0",
|
||||||
"importlib_metadata",
|
"importlib_metadata",
|
||||||
"ipadic>=1.0.0,<2.0",
|
"ipadic>=1.0.0,<2.0",
|
||||||
"isort>=5.5.4",
|
"isort>=5.5.4",
|
||||||
@@ -321,6 +321,7 @@ extras["testing"] = (
|
|||||||
"rjieba",
|
"rjieba",
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"tensorboard",
|
"tensorboard",
|
||||||
|
"pydantic",
|
||||||
)
|
)
|
||||||
+ extras["retrieval"]
|
+ extras["retrieval"]
|
||||||
+ extras["modelcreation"]
|
+ extras["modelcreation"]
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ deps = {
|
|||||||
"fugashi": "fugashi>=1.0",
|
"fugashi": "fugashi>=1.0",
|
||||||
"GitPython": "GitPython<3.1.19",
|
"GitPython": "GitPython<3.1.19",
|
||||||
"hf-doc-builder": "hf-doc-builder>=0.3.0",
|
"hf-doc-builder": "hf-doc-builder>=0.3.0",
|
||||||
"huggingface-hub": "huggingface-hub>=0.16.4,<1.0",
|
"huggingface-hub": "huggingface-hub>=0.19.3,<1.0",
|
||||||
"importlib_metadata": "importlib_metadata",
|
"importlib_metadata": "importlib_metadata",
|
||||||
"ipadic": "ipadic>=1.0.0,<2.0",
|
"ipadic": "ipadic>=1.0.0,<2.0",
|
||||||
"isort": "isort>=5.5.4",
|
"isort": "isort>=5.5.4",
|
||||||
|
|||||||
@@ -15,7 +15,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import hashlib
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@@ -24,6 +23,7 @@ import urllib
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ def _download(url: str, root: str) -> io.BytesIO:
|
|||||||
|
|
||||||
if os.path.isfile(download_target):
|
if os.path.isfile(download_target):
|
||||||
model_bytes = open(download_target, "rb").read()
|
model_bytes = open(download_target, "rb").read()
|
||||||
if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
|
if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
|
||||||
return torch.load(io.BytesIO(model_bytes))
|
return torch.load(io.BytesIO(model_bytes))
|
||||||
else:
|
else:
|
||||||
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
|
warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
|
||||||
@@ -132,7 +132,7 @@ def _download(url: str, root: str) -> io.BytesIO:
|
|||||||
loop.update(len(buffer))
|
loop.update(len(buffer))
|
||||||
|
|
||||||
model_bytes = open(download_target, "rb").read()
|
model_bytes = open(download_target, "rb").read()
|
||||||
if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
|
if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
|
"Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -12,9 +12,10 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
|
|
||||||
from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
|
from transformers import MODEL_FOR_DEPTH_ESTIMATION_MAPPING, is_torch_available, is_vision_available
|
||||||
from transformers.pipelines import DepthEstimationPipeline, pipeline
|
from transformers.pipelines import DepthEstimationPipeline, pipeline
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
@@ -44,7 +45,7 @@ else:
|
|||||||
|
|
||||||
|
|
||||||
def hashimage(image: Image) -> str:
|
def hashimage(image: Image) -> str:
|
||||||
m = hashlib.md5(image.tobytes())
|
m = insecure_hashlib.md5(image.tobytes())
|
||||||
return m.hexdigest()
|
return m.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,6 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
@@ -21,6 +20,7 @@ import datasets
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import requests
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
|
||||||
@@ -59,7 +59,7 @@ else:
|
|||||||
|
|
||||||
|
|
||||||
def hashimage(image: Image) -> str:
|
def hashimage(image: Image) -> str:
|
||||||
m = hashlib.md5(image.tobytes())
|
m = insecure_hashlib.md5(image.tobytes())
|
||||||
return m.hexdigest()[:10]
|
return m.hexdigest()[:10]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,11 +12,11 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import unittest
|
import unittest
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from huggingface_hub.utils import insecure_hashlib
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
MODEL_FOR_MASK_GENERATION_MAPPING,
|
MODEL_FOR_MASK_GENERATION_MAPPING,
|
||||||
@@ -46,7 +46,7 @@ else:
|
|||||||
|
|
||||||
|
|
||||||
def hashimage(image: Image) -> str:
|
def hashimage(image: Image) -> str:
|
||||||
m = hashlib.md5(image.tobytes())
|
m = insecure_hashlib.md5(image.tobytes())
|
||||||
return m.hexdigest()[:10]
|
return m.hexdigest()[:10]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user