From 286a18fa0080dd39bd373008d11d831fbb1a77f1 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Thu, 28 Jul 2022 15:58:26 +0200 Subject: [PATCH] Fix codeparrot deduplication - ignore whitespaces (#18023) * ignore whitespaces for hash * reformat code * Update README.md --- examples/research_projects/codeparrot/README.md | 2 +- .../research_projects/codeparrot/scripts/preprocessing.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index 4e70381a74..ef92606c54 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -39,7 +39,7 @@ The source of the dataset is the GitHub dump available on Google's [BigQuery](ht ### Preprocessing The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones: -- exact deduplication using each file's hash +- exact deduplication using each file's hash after having removed whitespaces. - near deduplication using MinHash and Jaccard similarity. MinHash with a Jaccard threshold (default=0.85) is first used to create duplicate clusters. Then these clusters are then reduced to unique files based on the exact Jaccard similarity. See `deduplicate_dataset` in `minhash_deduplication.py` for a detailed description. 
- filtering files with max line length > 1000 - filtering files with mean line length > 100 diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py index 3d4ec40dec..6236a8aad8 100644 --- a/examples/research_projects/codeparrot/scripts/preprocessing.py +++ b/examples/research_projects/codeparrot/scripts/preprocessing.py @@ -3,6 +3,7 @@ import hashlib import json import multiprocessing import os +import re import shutil import time from pathlib import Path @@ -15,9 +16,12 @@ from minhash_deduplication import deduplicate_dataset from transformers import AutoTokenizer, HfArgumentParser +PATTERN = re.compile(r"\s+") + + def get_hash(example): """Get hash of content field.""" - return {"hash": hashlib.md5(example["content"].strip().encode("utf-8")).hexdigest()} + return {"hash": hashlib.md5(re.sub(PATTERN, "", example["content"]).encode("utf-8")).hexdigest()} def line_stats(example):