Replace accelerator.use_fp16 in examples (#33513)
* Replace `accelerator.use_fp16` in examples
* `pad_to_multiple_of=16` for fp8
This commit is contained in:
@@ -473,9 +473,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorForMultipleChoice(
|
# For fp8, we pad to multiple of 16.
|
||||||
tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
|
if accelerator.mixed_precision == "fp8":
|
||||||
)
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorForMultipleChoice(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -670,7 +670,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -685,7 +685,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -534,11 +534,17 @@ def main():
|
|||||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||||
|
|
||||||
label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
data_collator = DataCollatorForSeq2Seq(
|
data_collator = DataCollatorForSeq2Seq(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
model=model,
|
model=model,
|
||||||
label_pad_token_id=label_pad_token_id,
|
label_pad_token_id=label_pad_token_id,
|
||||||
pad_to_multiple_of=8 if accelerator.use_fp16 else None,
|
pad_to_multiple_of=pad_to_multiple_of,
|
||||||
)
|
)
|
||||||
|
|
||||||
def postprocess_text(preds, labels):
|
def postprocess_text(preds, labels):
|
||||||
|
|||||||
@@ -426,7 +426,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -541,9 +541,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorForTokenClassification(
|
# For fp8, we pad to multiple of 16.
|
||||||
tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
|
if accelerator.mixed_precision == "fp8":
|
||||||
)
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -517,11 +517,18 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
data_collator = DataCollatorForSeq2Seq(
|
data_collator = DataCollatorForSeq2Seq(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
model=model,
|
model=model,
|
||||||
label_pad_token_id=label_pad_token_id,
|
label_pad_token_id=label_pad_token_id,
|
||||||
pad_to_multiple_of=8 if accelerator.use_fp16 else None,
|
pad_to_multiple_of=pad_to_multiple_of,
|
||||||
)
|
)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
|
|||||||
@@ -542,9 +542,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorForLukeTokenClassification(
|
# For fp8, we pad to multiple of 16.
|
||||||
tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
|
if accelerator.mixed_precision == "fp8":
|
||||||
)
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorForLukeTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
@@ -704,7 +704,14 @@ def finetune(accelerator, model_name_or_path, train_file, output_dir, **kwargs):
|
|||||||
# precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of
|
# precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of
|
||||||
# 8s, which will enable the use of Tensor Cores on NVIDIA hardware with
|
# 8s, which will enable the use of Tensor Cores on NVIDIA hardware with
|
||||||
# compute capability >= 7.5 (Volta).
|
# compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
|
|||||||
@@ -836,7 +836,14 @@ def main():
|
|||||||
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
|
||||||
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
|
||||||
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
|
# For fp8, we pad to multiple of 16.
|
||||||
|
if accelerator.mixed_precision == "fp8":
|
||||||
|
pad_to_multiple_of = 16
|
||||||
|
elif accelerator.mixed_precision != "no":
|
||||||
|
pad_to_multiple_of = 8
|
||||||
|
else:
|
||||||
|
pad_to_multiple_of = None
|
||||||
|
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
|
||||||
|
|
||||||
train_dataloader = DataLoader(
|
train_dataloader = DataLoader(
|
||||||
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
|
||||||
|
|||||||
Reference in New Issue
Block a user