Fix CodeParrot training script (#17291)

* average loss over batches and accumulated steps for tracking

* fix layernorm weight decay

* use AdamW from Pytorch instead of Transformers

* add shuffling of sequences inside the batches

* add shuffling of sequences inside the batches

* add logging dir and reformat code

* fix lr tracking

* remove Mistral scaling

* keep Mistral scaling

* reformat code

* fix error

* fix error

* use shuffling function from Pytorch

* remove argument for shuffling batch sequences as it isn't optional

* update package versions and install accelerate from source

* remove unused package

* Update loss average over accumulated steps

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* Update loss average over accumulated steps

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* use one shuffle buffer argument

* compute avg_loss in one line

Co-authored-by: Loubna ben allal <loubnabenallal@gmail.com>
Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
This commit is contained in:
Loubna Ben Allal
2022-05-23 12:55:35 +02:00
committed by GitHub
parent b9bb417324
commit b48ac1a094
4 changed files with 43 additions and 21 deletions

View File

@@ -24,7 +24,7 @@ class TrainingArguments:
valid_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for evaluation."})
weight_decay: Optional[float] = field(default=0.1, metadata={"help": "Value of weight decay."})
shuffle_buffer: Optional[int] = field(
default=1000, metadata={"help": "Size of buffer used to shuffle streaming dataset."}
default=10000, metadata={"help": "Size of buffer used to shuffle streaming dataset."}
)
learning_rate: Optional[float] = field(default=2e-4, metadata={"help": "Learning rate fo training."})
lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "Learning rate."})