diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 8f7693ae5b..3987b6d20d 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -473,9 +473,14 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorForMultipleChoice( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) - ) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorForMultipleChoice(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index ee791c0c8d..f8e2f56f8e 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -670,7 +670,14 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 7ae0d488bc..f0a22e5163 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -685,7 +685,14 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 36cd590ea5..21da107000 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -534,11 +534,17 @@ def main(): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if accelerator.use_fp16 else None, + pad_to_multiple_of=pad_to_multiple_of, ) def postprocess_text(preds, labels): diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index ac62edbe5e..da9193ab1c 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -426,7 +426,14 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 2afb38bb44..77016e2a6c 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -541,9 +541,14 @@ def main(): # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorForTokenClassification( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) - ) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 97da3f9541..70ef92284d 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -517,11 +517,18 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if accelerator.use_fp16 else None, + pad_to_multiple_of=pad_to_multiple_of, ) train_dataloader = DataLoader( diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py index cac487b059..1552acbd42 100644 --- a/examples/research_projects/luke/run_luke_ner_no_trainer.py +++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py @@ -542,9 +542,14 @@ def main(): # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorForLukeTokenClassification( - tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) - ) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorForLukeTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size diff --git a/examples/research_projects/self-training-text-classification/finetuning.py b/examples/research_projects/self-training-text-classification/finetuning.py index 0afff6a91e..4bf9eb28df 100644 --- a/examples/research_projects/self-training-text-classification/finetuning.py +++ b/examples/research_projects/self-training-text-classification/finetuning.py @@ -704,7 +704,14 @@ def finetune(accelerator, model_name_or_path, train_file, output_dir, **kwargs): # precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of # 8s, which will enable the use of Tensor Cores on NVIDIA hardware with # compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py index 5c39262cc9..0b27b49212 100755 --- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py +++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py @@ -836,7 +836,14 @@ def main(): # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) + # For fp8, we pad to multiple of 16. + if accelerator.mixed_precision == "fp8": + pad_to_multiple_of = 16 + elif accelerator.mixed_precision != "no": + pad_to_multiple_of = 8 + else: + pad_to_multiple_of = None + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size