Update codeparrot data preprocessing (#16944)

* add new preprocessing arguments

* add new filters

* add new filters to readme

* fix config and test count, update function names and docstrings

* reformat code

* update readme

* Update readme

* rename config_test filter

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* rename few_assignments filter

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* rename tokenizer in arguments

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* rename functions and add limit_line argument for config_test filter

* update threshold for config_test filter

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
This commit is contained in:
Loubna Ben Allal
2022-05-16 14:43:25 +02:00
committed by GitHub
parent 518dd1277e
commit e730e12567
3 changed files with 89 additions and 6 deletions

View File

@@ -133,7 +133,7 @@ class PreprocessingArguments:
},
)
dataset_name: Optional[str] = field(
default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
)
output_dir: Optional[str] = field(
default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
@@ -151,6 +151,16 @@ class PreprocessingArguments:
alpha_frac: Optional[float] = field(
default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
)
min_token_ratio: Optional[float] = field(
default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."}
)
filter_proba: Optional[float] = field(
default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
)
tokenizer: Optional[str] = field(
default="lvwerra/codeparrot",
metadata={"help": "Name or path to the tokenizer."},
)
@dataclass