CodeParrot data pretokenization (#16932)

* add pretokenization arguments

* add pretokenization script

* add support for pretokenized data

* reformat code

* fix run command for training

* fix model call from config

* remove a package

* add comments on pretokenization in the readme

* remove explicit parallelization

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* update readme

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* update readme -remove username

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* update readme -remove username

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* keep data parallelization

* reformat code

* reformat code

* update readme

* reformat code

* Update examples/research_projects/codeparrot/README.md

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
This commit is contained in:
Loubna Ben Allal
2022-05-16 15:32:16 +02:00
committed by GitHub
parent e730e12567
commit 05a90579a8
5 changed files with 120 additions and 26 deletions

View File

@@ -9,12 +9,10 @@ class TrainingArguments:
"""
model_ckpt: Optional[str] = field(
default="lvwerra/codeparrot",
metadata={"help": "Model name or path of model to be trained."},
default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be trained."}
)
save_dir: Optional[str] = field(
default="./",
metadata={"help": "Save dir where model repo is cloned and models updates are saved to."},
default="./", metadata={"help": "Save dir where model repo is cloned and models updates are saved to."}
)
dataset_name_train: Optional[str] = field(
default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path of training dataset."}
@@ -39,7 +37,7 @@ class TrainingArguments:
gradient_checkpointing: Optional[bool] = field(
default=True, metadata={"help": "Use gradient checkpointing to reduce memory footprint."}
)
max_train_steps: Optional[int] = field(default=50_000, metadata={"help": "Maximum number of training steps."})
max_train_steps: Optional[int] = field(default=50000, metadata={"help": "Maximum number of training steps."})
max_eval_steps: Optional[int] = field(
default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
)
@@ -50,9 +48,9 @@ class TrainingArguments:
metadata={"help": "Interval to save checkpoints. Measured as number of forward passes not training steps."},
)
resume_from_checkpoint: Optional[str] = field(
default=None,
metadata={"help": "States path if the training should continue from a checkpoint folder."},
default=None, metadata={"help": "States path if the training should continue from a checkpoint folder."}
)
tokenized: Optional[bool] = field(default=False, metadata={"help": "If True the data is pretokenized."})
@dataclass
@@ -62,8 +60,7 @@ class EvaluationArguments:
"""
model_ckpt: Optional[str] = field(
default="lvwerra/codeparrot",
metadata={"help": "Model name or path of model to be evaluated."},
default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be evaluated."}
)
dataset_name: Optional[str] = field(
default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
@@ -83,8 +80,7 @@ class HumanEvalArguments:
"""
model_ckpt: Optional[str] = field(
default="lvwerra/codeparrot",
metadata={"help": "Model name or path of model to be evaluated."},
default="lvwerra/codeparrot", metadata={"help": "Model name or path of model to be evaluated."}
)
num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
num_tasks: Optional[int] = field(
@@ -170,14 +166,13 @@ class TokenizerTrainingArguments:
"""
base_tokenizer: Optional[str] = field(
default="gpt2",
metadata={"help": "Base tokenizer to build new tokenizer from."},
default="gpt2", metadata={"help": "Base tokenizer to build new tokenizer from."}
)
dataset_name: Optional[str] = field(
default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
)
text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
vocab_size: Optional[int] = field(default=200000, metadata={"help": "Number of examples to train tokenizer on."})
vocab_size: Optional[int] = field(default=200_000, metadata={"help": "Number of examples to train tokenizer on."})
n_examples: Optional[int] = field(
default=32768, metadata={"help": "Number of examples to train the tokenizer on."}
)
@@ -185,6 +180,24 @@ class TokenizerTrainingArguments:
push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved tokenizer to the hub."})
@dataclass
class PretokenizationArguments:
"""
Configuration for data pretokenization.
"""
tokenizer_dir: Optional[str] = field(
default="lvwerra/codeparrot", metadata={"help": "Name or path to the tokenizer."}
)
dataset_name: Optional[str] = field(
default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path to the dataset to pretokenize."}
)
tokenized_data_repo: Optional[str] = field(
default="tokenized-codeparrot-train", metadata={"help": "Repo name of the pretokenized data."}
)
num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
@dataclass
class InitializationArguments:
"""
@@ -192,8 +205,7 @@ class InitializationArguments:
"""
config_name: Optional[str] = field(
default="gpt2-large",
metadata={"help": "Configuration to use for model initialization."},
default="gpt2-large", metadata={"help": "Configuration to use for model initialization."}
)
tokenizer_name: Optional[str] = field(
default="lvwerra/codeparrot", metadata={"help": "Tokenizer attached to model."}