Fix hashing for deduplication (#17048)
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
import gzip
|
import gzip
|
||||||
|
import hashlib
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
@@ -13,7 +14,7 @@ from transformers import HfArgumentParser
|
|||||||
|
|
||||||
def get_hash(example):
|
def get_hash(example):
|
||||||
"""Get hash of content field."""
|
"""Get hash of content field."""
|
||||||
return {"hash": hash(example["content"])}
|
return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()}
|
||||||
|
|
||||||
|
|
||||||
def line_stats(example):
|
def line_stats(example):
|
||||||
|
|||||||
Reference in New Issue
Block a user