Update codeparrot data preprocessing (#16944)
* add new preprocessing arguments * add new filters * add new filters to readme * fix config and test count, update function names and docstrings * reformat code * update readme * Update readme * rename config_test filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename few_assignments filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename tokenizer in arguments Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename functions and add limit_line argument for config_test filter * update threshold for config_test filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
This commit is contained in:
@@ -37,20 +37,25 @@ Additionally, sure you have git-lfs installed. You can find instructions for how
|
|||||||
The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).
|
The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).
|
||||||
|
|
||||||
### Preprocessing
|
### Preprocessing
|
||||||
The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374):
|
The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones:
|
||||||
|
|
||||||
- exact deduplication using each file's hash
|
- exact deduplication using each file's hash
|
||||||
- filtering files with max line length > 1000
|
- filtering files with max line length > 1000
|
||||||
- filtering files with mean line length > 100
|
- filtering files with mean line length > 100
|
||||||
- fraction of alphanumeric characters < 0.25
|
- fraction of alphanumeric characters < 0.25
|
||||||
- containing the word "auto-generated" or similar in the first 5 lines
|
- containing the word "auto-generated" or similar in the first 5 lines
|
||||||
|
- filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines
|
||||||
|
- filtering with a probability of 0.7 of files with high occurrence of the keywords "test " or "config"
|
||||||
|
- filtering with a probability of 0.7 of files without a mention of the keywords `def`, `for`, `while` and `class`
|
||||||
|
- filtering files that use the assignment operator `=` fewer than 5 times
|
||||||
|
- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6)
|
||||||
|
|
||||||
The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project.
|
The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-v2) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-v2) splits are also available on the Hub if you want to skip this step or use the data for another project.
|
||||||
|
|
||||||
To execute the preprocessing run the following command:
|
To execute the preprocessing run the following command:
|
||||||
```bash
|
```bash
|
||||||
python scripts/preprocessing.py \
|
python scripts/preprocessing.py \
|
||||||
--dataset_name lvwerra/codeparrot \
|
--dataset_name transformersbook/codeparrot \
|
||||||
--output_dir codeparrot-clean
|
--output_dir codeparrot-clean
|
||||||
```
|
```
|
||||||
During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. Make sure you have more than 500GB free disk space to execute it.
|
During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. Make sure you have more than 500GB free disk space to execute it.
|
||||||
|
|||||||
@@ -133,7 +133,7 @@ class PreprocessingArguments:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
dataset_name: Optional[str] = field(
|
dataset_name: Optional[str] = field(
|
||||||
default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
|
default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
|
||||||
)
|
)
|
||||||
output_dir: Optional[str] = field(
|
output_dir: Optional[str] = field(
|
||||||
default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
|
default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
|
||||||
@@ -151,6 +151,16 @@ class PreprocessingArguments:
|
|||||||
alpha_frac: Optional[float] = field(
|
alpha_frac: Optional[float] = field(
|
||||||
default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
|
default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
|
||||||
)
|
)
|
||||||
|
min_token_ratio: Optional[float] = field(
|
||||||
|
default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."}
|
||||||
|
)
|
||||||
|
filter_proba: Optional[float] = field(
|
||||||
|
default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
|
||||||
|
)
|
||||||
|
tokenizer: Optional[str] = field(
|
||||||
|
default="lvwerra/codeparrot",
|
||||||
|
metadata={"help": "Name or path to the tokenizer."},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import numpy as np
|
|||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
|
||||||
from arguments import PreprocessingArguments
|
from arguments import PreprocessingArguments
|
||||||
from transformers import HfArgumentParser
|
from transformers import AutoTokenizer, HfArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def get_hash(example):
|
def get_hash(example):
|
||||||
@@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
|
|||||||
return {"autogenerated": False}
|
return {"autogenerated": False}
|
||||||
|
|
||||||
|
|
||||||
|
def is_config_or_test(example, scan_width=5, coeff=0.05):
    """Flag files that look like configuration files or unit tests.

    Two heuristics are applied in order:

    1. Header scan: one of the phrases "unit tests", "test file" or
       "configuration file" appears (case-insensitively) in the first
       ``scan_width`` lines.
    2. Frequency check: the number of occurrences of "config", or of "test"
       (case-insensitive, matched as substrings), exceeds
       ``int(coeff * number_of_newline_characters)``.

    Args:
        example: Mapping holding the file text under the ``"content"`` key.
        scan_width: Number of leading lines inspected by the header scan.
        coeff: Allowed keyword occurrences per newline character.

    Returns:
        ``{"config_or_test": True}`` if either heuristic fires, otherwise
        ``{"config_or_test": False}``.
    """
    keywords = ("unit tests", "test file", "configuration file")
    content = example["content"]
    lines = content.splitlines()

    # First test: an explicit mention near the top of the file.
    for line in lines[: max(scan_width, 0)]:
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            return {"config_or_test": True}

    # Second test: keyword density over the whole file. The threshold is
    # floored, so a file with fewer than 1 / coeff newlines gets a threshold
    # of 0 and a single occurrence is enough to flag it.
    threshold = int(coeff * content.count("\n"))
    count_config = 0
    count_test = 0
    for line in lines:
        lowered = line.lower()  # lowercase once, reuse for both counts
        count_config += lowered.count("config")
        count_test += lowered.count("test")

    return {"config_or_test": count_config > threshold or count_test > threshold}
|
||||||
|
|
||||||
|
|
||||||
|
def has_no_keywords(example):
    """Check whether a Python file contains no function, class or loop keyword.

    The text under ``example["content"]`` is lowercased and searched for the
    substrings ``"def "``, ``"class "``, ``"for "`` and ``"while "``. The
    match is a plain substring test, so occurrences inside comments or
    strings count as well.

    Args:
        example: Mapping holding the file text under the ``"content"`` key.

    Returns:
        ``{"has_no_keywords": True}`` when none of the substrings occurs,
        otherwise ``{"has_no_keywords": False}``.
    """
    keywords = ("def ", "class ", "for ", "while ")
    # None of the keywords contains a line separator, so searching the whole
    # text is equivalent to searching it line by line.
    lowered = example["content"].lower()
    return {"has_no_keywords": not any(keyword in lowered for keyword in keywords)}
|
||||||
|
|
||||||
|
|
||||||
|
def has_few_assignments(example, minimum=4):
    """Check if a file uses the symbol '=' at most `minimum` times.

    Every ``=`` character in ``example["content"]`` is counted, so ``==``
    contributes two and keyword arguments or comparisons such as ``<=`` are
    counted as well.

    Args:
        example: Mapping holding the file text under the ``"content"`` key.
        minimum: Largest number of ``=`` characters that still counts as
            "few"; the file needs strictly more than this to pass.

    Returns:
        ``{"has_few_assignments": True}`` when the count is ``<= minimum``,
        otherwise ``{"has_few_assignments": False}``.
    """
    # "=" is a single non-separator character, so one count over the full
    # text equals the sum of per-line counts.
    occurrences = example["content"].count("=")
    return {"has_few_assignments": occurrences <= minimum}
|
||||||
|
|
||||||
|
|
||||||
|
def char_token_ratio(example):
    """Compute the character/token ratio of the file with the tokenizer.

    Uses the module-level ``tokenizer`` object, which must be defined before
    this function is called.

    Args:
        example: Mapping holding the file text under the ``"content"`` key.

    Returns:
        ``{"ratio": len(content) / number_of_tokens}``. When the tokenizer
        produces no tokens (e.g. for empty content) the ratio is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    content = example["content"]
    input_ids = tokenizer(content, truncation=False)["input_ids"]
    if not input_ids:
        # Nothing was tokenized; avoid dividing by zero.
        return {"ratio": 0.0}
    return {"ratio": len(content) / len(input_ids)}
|
||||||
|
|
||||||
|
|
||||||
def preprocess(example):
|
def preprocess(example):
|
||||||
"""Chain all preprocessing steps into one function to not fill cache."""
|
"""Chain all preprocessing steps into one function to not fill cache."""
|
||||||
results = dict()
|
results = dict()
|
||||||
results.update(get_hash(example))
|
results.update(get_hash(example))
|
||||||
results.update(line_stats(example))
|
results.update(line_stats(example))
|
||||||
results.update(alpha_stats(example))
|
results.update(alpha_stats(example))
|
||||||
|
results.update(char_token_ratio(example))
|
||||||
results.update(is_autogenerated(example))
|
results.update(is_autogenerated(example))
|
||||||
|
results.update(is_config_or_test(example))
|
||||||
|
results.update(has_no_keywords(example))
|
||||||
|
results.update(has_few_assignments(example))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def filter(example, uniques, args):
|
def filter(example, uniques, args):
|
||||||
"""Filter dataset with heuristics."""
|
"""Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability."""
|
||||||
if not check_uniques(example, uniques):
|
if not check_uniques(example, uniques):
|
||||||
return False
|
return False
|
||||||
elif example["autogenerated"]:
|
elif example["autogenerated"]:
|
||||||
@@ -72,6 +131,14 @@ def filter(example, uniques, args):
|
|||||||
return False
|
return False
|
||||||
elif example["alpha_frac"] < args.alpha_frac:
|
elif example["alpha_frac"] < args.alpha_frac:
|
||||||
return False
|
return False
|
||||||
|
elif example["ratio"] < args.min_token_ratio:
|
||||||
|
return False
|
||||||
|
elif example["config_or_test"] and np.random.rand() <= args.filter_proba:
|
||||||
|
return False
|
||||||
|
elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba:
|
||||||
|
return False
|
||||||
|
elif example["has_few_assignments"]:
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -89,6 +156,7 @@ parser = HfArgumentParser(PreprocessingArguments)
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.num_workers is None:
|
if args.num_workers is None:
|
||||||
args.num_workers = multiprocessing.cpu_count()
|
args.num_workers = multiprocessing.cpu_count()
|
||||||
|
# PreprocessingArguments exposes the tokenizer name/path as `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
|
||||||
|
|
||||||
# Load dataset
|
# Load dataset
|
||||||
t_start = time.time()
|
t_start = time.time()
|
||||||
|
|||||||
Reference in New Issue
Block a user