Fix hashing for deduplication (#17048)

This commit is contained in:
Thomas Wang
2022-05-04 08:40:24 +02:00
committed by GitHub
parent 39f8eafc1b
commit db034660fb

View File

@@ -1,4 +1,5 @@
import gzip
import hashlib
import multiprocessing
import os
import shutil
@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
def get_hash(example):
    """Return a deterministic content hash for one example.

    The ``"content"`` string is stripped of leading/trailing whitespace,
    encoded as UTF-8, and digested with MD5, so examples that differ only
    in surrounding whitespace get the same hash.

    Args:
        example: Mapping with a string value under the ``"content"`` key.

    Returns:
        A dict ``{"hash": <hex digest>}`` holding the 32-character MD5 hex
        digest of the normalized content.
    """
    # The builtin hash() is salted per interpreter process for str inputs,
    # so it is not stable across runs or workers; a content digest is
    # stable and therefore usable as a deduplication key.
    normalized = example["content"].strip().encode("utf-8")
    return {"hash": hashlib.md5(normalized).hexdigest()}
def line_stats(example):