Update codeparrot data preprocessing (#16944)
* add new preprocessing arguments * add new filters * add new filters to readme * fix config and test count, update function names and docstrings * reformat code * update readme * Update readme * rename config_test filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename few_assignments filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename tokenizer in arguments Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> * rename functions and add limit_line argument for config_test filter * update threshold for config_test filter Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com> Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
This commit is contained in:
@@ -9,7 +9,7 @@ import numpy as np
|
||||
from datasets import load_dataset
|
||||
|
||||
from arguments import PreprocessingArguments
|
||||
from transformers import HfArgumentParser
|
||||
from transformers import AutoTokenizer, HfArgumentParser
|
||||
|
||||
|
||||
def get_hash(example):
|
||||
@@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
|
||||
return {"autogenerated": False}
|
||||
|
||||
|
||||
def is_config_or_test(example, scan_width=5, coeff=0.05):
    """Flag a file that looks like a configuration file or a unit test.

    Two heuristics are applied to ``example["content"]``:

    1. One of the phrases "unit tests", "test file" or "configuration file"
       appears (case-insensitively) in the first ``scan_width`` lines.
    2. The substring "config" or the substring "test" occurs
       (case-insensitively) more than ``int(coeff * number_of_newlines)``
       times in the whole file.

    Returns a dict with the single key ``"config_or_test"`` mapped to a bool.
    """
    markers = ("unit tests", "test file", "configuration file")
    text = example["content"]
    # Lower-case every line once; both heuristics are case-insensitive.
    lowered = [row.lower() for row in text.splitlines()]

    # Heuristic 1: tell-tale phrases near the top of the file.
    # A non-positive scan_width scans nothing.
    head = lowered[: max(scan_width, 0)]
    if any(marker in row for row in head for marker in markers):
        return {"config_or_test": True}

    # Heuristic 2: frequency of "config" / "test" relative to the file length,
    # where the length is measured as the number of newline characters.
    limit = int(coeff * text.count("\n"))
    n_config = sum(row.count("config") for row in lowered)
    n_test = sum(row.count("test") for row in lowered)
    return {"config_or_test": n_config > limit or n_test > limit}
|
||||
|
||||
|
||||
def has_no_keywords(example):
    """Report whether a file lacks every one of a few structural keywords.

    Each line of ``example["content"]`` is lower-cased and searched for the
    substrings "def ", "class ", "for " and "while " (function, class, for
    loop, while loop). Returns ``{"has_no_keywords": True}`` only when no
    line contains any of them.
    """
    markers = ("def ", "class ", "for ", "while ")
    found = any(
        marker in row.lower()
        for row in example["content"].splitlines()
        for marker in markers
    )
    return {"has_no_keywords": not found}
|
||||
|
||||
|
||||
def has_few_assignments(example, minimum=4):
    """Report whether the '=' character appears at most `minimum` times.

    Counts every '=' in ``example["content"]`` (so "==" contributes two).
    Returns ``{"has_few_assignments": False}`` once the count exceeds
    ``minimum`` and ``{"has_few_assignments": True}`` otherwise.
    """
    rows = example["content"].splitlines()
    total = sum(row.count("=") for row in rows)
    # A file with no lines never exceeds the limit, whatever `minimum` is.
    exceeded = bool(rows) and total > minimum
    return {"has_few_assignments": not exceeded}
|
||||
|
||||
|
||||
def char_token_ratio(example):
    """Return the number of characters per token for the file's content.

    The content is passed to the module-level `tokenizer` without truncation;
    the result is ``{"ratio": len(content) / len(input_ids)}``.
    """
    text = example["content"]
    token_ids = tokenizer(text, truncation=False)["input_ids"]
    return {"ratio": len(text) / len(token_ids)}
|
||||
|
||||
|
||||
def preprocess(example):
    """Run every per-example preprocessing step and merge their outputs.

    All steps are chained in a single function so that only one result is
    cached instead of one per step. Each step receives the same `example`
    and returns a dict; the dicts are merged in order into one result.
    """
    steps = (
        get_hash,
        line_stats,
        alpha_stats,
        char_token_ratio,
        is_autogenerated,
        is_config_or_test,
        has_no_keywords,
        has_few_assignments,
    )
    results = {}
    for step in steps:
        results.update(step(example))
    return results
|
||||
|
||||
|
||||
def filter(example, uniques, args):
|
||||
"""Filter dataset with heuristics."""
|
||||
"""Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability."""
|
||||
if not check_uniques(example, uniques):
|
||||
return False
|
||||
elif example["autogenerated"]:
|
||||
@@ -72,6 +131,14 @@ def filter(example, uniques, args):
|
||||
return False
|
||||
elif example["alpha_frac"] < args.alpha_frac:
|
||||
return False
|
||||
elif example["ratio"] < args.min_token_ratio:
|
||||
return False
|
||||
elif example["config_or_test"] and np.random.rand() <= args.filter_proba:
|
||||
return False
|
||||
elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba:
|
||||
return False
|
||||
elif example["has_few_assignments"]:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
@@ -89,6 +156,7 @@ parser = HfArgumentParser(PreprocessingArguments)
|
||||
args = parser.parse_args()
|
||||
if args.num_workers is None:
|
||||
args.num_workers = multiprocessing.cpu_count()
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
|
||||
|
||||
# Load dataset
|
||||
t_start = time.time()
|
||||
|
||||
Reference in New Issue
Block a user