TF Examples Rewrite (#18451)
* Finished QA example
* Dodge a merge conflict
* Update text classification and LM examples
* Update NER example
* New Keras metrics WIP, fix NER example
* Update NER example
* Update MC, summarization and translation examples
* Add XLA warnings when shapes are variable
* Make sure batch_size is consistently scaled by num_replicas
* Add PushToHubCallback to all models
* Add docs links for KerasMetricCallback
* Add docs links for prepare_tf_dataset and jit_compile
* Correct inferred model names
* Don't assume the dataset has 'lang'
* Don't assume the dataset has 'lang'
* Write metrics in text classification
* Add 'framework' to TrainingArguments and TFTrainingArguments
* Export metrics in all examples and add tests
* Fix training args for Flax
* Update command line args for translation test
* make fixup
* Fix accidentally running other tests in fp16
* Remove do_train/do_eval from run_clm.py
* Remove do_train/do_eval from run_mlm.py
* Add tensorflow tests to circleci
* Fix circleci
* Update examples/tensorflow/language-modeling/run_mlm.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/test_tensorflow_examples.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/translation/run_translation.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/token-classification/run_ner.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Fix save path for tests
* Fix some model card kwargs
* Explain the magical -1000
* Actually enable tests this time
* Skip text classification PR until we fix shape inference
* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -22,9 +22,7 @@ https://huggingface.co/models?filter=fill-mask
|
||||
"""
|
||||
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
||||
|
||||
# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected
|
||||
# TODO Duplicate all changes over to the CLM script
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
@@ -50,6 +48,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForMaskedLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -217,22 +216,6 @@ class DataTrainingArguments:
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper classes
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
||||
# that saves the model with this method after each epoch.
|
||||
def __init__(self, output_dir, **kwargs):
|
||||
super().__init__()
|
||||
self.output_dir = output_dir
|
||||
|
||||
def on_epoch_end(self, epoch, logs=None):
|
||||
self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
def main():
|
||||
# region Argument Parsing
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
||||
@@ -492,7 +475,7 @@ def main():
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
# endregion
|
||||
|
||||
@@ -517,40 +500,88 @@ def main():
|
||||
options = tf.data.Options()
|
||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
|
||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||
shuffle=False,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer and loss
|
||||
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
|
||||
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
# Bias and layernorm weights are automatically excluded from the decay
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
|
||||
num_warmup_steps=training_args.warmup_steps,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer)
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-mlm"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and validation
|
||||
@@ -560,33 +591,46 @@ def main():
|
||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
||||
|
||||
# For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
|
||||
# to the Hugging Face Hub rather than just pushing the finished model.
|
||||
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
|
||||
|
||||
history = model.fit(
|
||||
tf_train_dataset,
|
||||
validation_data=tf_eval_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
|
||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
||||
callbacks=callbacks,
|
||||
)
|
||||
train_loss = history.history["loss"][-1]
|
||||
try:
|
||||
train_perplexity = math.exp(history.history["loss"][-1])
|
||||
train_perplexity = math.exp(train_loss)
|
||||
except OverflowError:
|
||||
train_perplexity = math.inf
|
||||
try:
|
||||
validation_perplexity = math.exp(history.history["val_loss"][-1])
|
||||
except OverflowError:
|
||||
validation_perplexity = math.inf
|
||||
logger.warning(f" Final train loss: {history.history['loss'][-1]:.3f}")
|
||||
logger.warning(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
logger.warning(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
|
||||
logger.warning(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||
logger.info(f" Final train loss: {train_loss:.3f}")
|
||||
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
|
||||
validation_loss = history.history["val_loss"][-1]
|
||||
try:
|
||||
validation_perplexity = math.exp(validation_loss)
|
||||
except OverflowError:
|
||||
validation_perplexity = math.inf
|
||||
logger.info(f" Final validation loss: {validation_loss:.3f}")
|
||||
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
results_dict = dict()
|
||||
results_dict["train_loss"] = train_loss
|
||||
results_dict["train_perplexity"] = train_perplexity
|
||||
results_dict["eval_loss"] = validation_loss
|
||||
results_dict["eval_perplexity"] = validation_perplexity
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(results_dict))
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# You'll probably want to append some of your own metadata here!
|
||||
model.push_to_hub()
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user