Update codeparrot data preprocessing (#16944)

* add new preprocessing arguments
* add new filters
* add new filters to readme
* fix config and test count, update function names and docstrings
* reformat code
* update readme
* Update readme
* rename config_test filter
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
* rename few_assignments filter
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
* rename tokenizer in arguments
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
* rename functions and add limit_line argument for config_test filter
* update threshold for config_test filter
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
2022-05-16 14:43:25 +02:00
parent 518dd1277e
commit e730e12567
3 changed files with 89 additions and 6 deletions
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -9,7 +9,7 @@ import numpy as np
 from datasets import load_dataset

 from arguments import PreprocessingArguments
-from transformers import HfArgumentParser
+from transformers import AutoTokenizer, HfArgumentParser


 def get_hash(example):
@@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
        return {"autogenerated": False}


+def is_config_or_test(example, scan_width=5, coeff=0.05):
+    """Check if file is a configuration file or a unit test by :
+    1- looking for keywords in the first few lines of the file.
+    2- counting the number of occurrences of the words 'config' and 'test' relative to the number of lines.
+    """
+
+    keywords = ["unit tests", "test file", "configuration file"]
+    lines = example["content"].splitlines()
+    count_config = 0
+    count_test = 0
+    # first test
+    for _, line in zip(range(scan_width), lines):
+        for keyword in keywords:
+            if keyword in line.lower():
+                return {"config_or_test": True}
+    # second test
+    nlines = example["content"].count("\n")
+    threshold = int(coeff * nlines)
+    for line in lines:
+        count_config += line.lower().count("config")
+        count_test += line.lower().count("test")
+        if count_config > threshold or count_test > threshold:
+            return {"config_or_test": True}
+    return {"config_or_test": False}
+
+
+def has_no_keywords(example):
+    """Check if a Python file has none of the keywords for: function, class, for loop, while loop."""
+    keywords = ["def ", "class ", "for ", "while "]
+    lines = example["content"].splitlines()
+    for line in lines:
+        for keyword in keywords:
+            if keyword in line.lower():
+                return {"has_no_keywords": False}
+    return {"has_no_keywords": True}
+
+
+def has_few_assignments(example, minimum=4):
+    """Check if file uses symbol '=' at most `minimum` times."""
+    lines = example["content"].splitlines()
+    counter = 0
+    for line in lines:
+        counter += line.lower().count("=")
+        if counter > minimum:
+            return {"has_few_assignments": False}
+    return {"has_few_assignments": True}
+
+
+def char_token_ratio(example):
+    """Compute character/token ratio of the file with tokenizer."""
+    input_ids = tokenizer(example["content"], truncation=False)["input_ids"]
+    ratio = len(example["content"]) / len(input_ids)
+    return {"ratio": ratio}
+
+
 def preprocess(example):
    """Chain all preprocessing steps into one function to not fill cache."""
    results = dict()
    results.update(get_hash(example))
    results.update(line_stats(example))
    results.update(alpha_stats(example))
+    results.update(char_token_ratio(example))
    results.update(is_autogenerated(example))
+    results.update(is_config_or_test(example))
+    results.update(has_no_keywords(example))
+    results.update(has_few_assignments(example))
    return results


 def filter(example, uniques, args):
-    """Filter dataset with heuristics."""
+    """Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability."""
    if not check_uniques(example, uniques):
        return False
    elif example["autogenerated"]:
@@ -72,6 +131,14 @@ def filter(example, uniques, args):
        return False
    elif example["alpha_frac"] < args.alpha_frac:
        return False
+    elif example["ratio"] < args.min_token_ratio:
+        return False
+    elif example["config_or_test"] and np.random.rand() <= args.filter_proba:
+        return False
+    elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba:
+        return False
+    elif example["has_few_assignments"]:
+        return False
    else:
        return True

@@ -89,6 +156,7 @@ parser = HfArgumentParser(PreprocessingArguments)
 args = parser.parse_args()
 if args.num_workers is None:
    args.num_workers = multiprocessing.cpu_count()
+tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

 # Load dataset
 t_start = time.time()