Set usedforsecurity=False in hashlib methods (FIPS compliance) (#27483)

* Set usedforsecurity=False in hashlib methods (FIPS compliance)

* trigger ci

* tokenizers version

* deps

* bump hfh version

* let's try this
This commit is contained in:
Lucain
2023-11-16 15:29:53 +01:00
committed by GitHub
parent 5603fad247
commit fd65aa9818
9 changed files with 21 additions and 19 deletions

View File

@@ -1,5 +1,4 @@
import gzip
import hashlib
import json
import multiprocessing
import os
@@ -11,6 +10,7 @@ from pathlib import Path
import numpy as np
from arguments import PreprocessingArguments
from datasets import load_dataset
from huggingface_hub.utils import insecure_hashlib
from minhash_deduplication import deduplicate_dataset
from transformers import AutoTokenizer, HfArgumentParser
@@ -21,7 +21,7 @@ PATTERN = re.compile(r"\s+")
def get_hash(example):
    """Get hash of content field.

    Every match of the module-level ``PATTERN`` is removed from
    ``example["content"]``; the remaining text is UTF-8 encoded and digested
    with MD5.

    Args:
        example: Mapping with a string ``"content"`` entry.

    Returns:
        A dict with a single ``"hash"`` key holding the hexadecimal digest.
    """
    stripped = re.sub(PATTERN, "", example["content"])
    # The digest is only a content fingerprint, so it goes through
    # `insecure_hashlib` rather than `hashlib` directly: this is the
    # usedforsecurity=False (FIPS compliance) change, and it must be the only
    # return path so the plain `hashlib.md5` call is never reached.
    digest = insecure_hashlib.md5(stripped.encode("utf-8")).hexdigest()
    return {"hash": digest}
def line_stats(example):