TF Examples Rewrite (#18451)

* Finished QA example

* Dodge a merge conflict

* Update text classification and LM examples

* Update NER example

* New Keras metrics WIP, fix NER example

* Update NER example

* Update MC, summarization and translation examples

* Add XLA warnings when shapes are variable

* Make sure batch_size is consistently scaled by num_replicas

* Add PushToHubCallback to all models

* Add docs links for KerasMetricCallback

* Add docs links for prepare_tf_dataset and jit_compile

* Correct inferred model names

* Don't assume the dataset has 'lang'

* Don't assume the dataset has 'lang'

* Write metrics in text classification

* Add 'framework' to TrainingArguments and TFTrainingArguments

* Export metrics in all examples and add tests

* Fix training args for Flax

* Update command line args for translation test

* make fixup

* Fix accidentally running other tests in fp16

* Remove do_train/do_eval from run_clm.py

* Remove do_train/do_eval from run_mlm.py

* Add tensorflow tests to circleci

* Fix circleci

* Update examples/tensorflow/language-modeling/run_mlm.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/test_tensorflow_examples.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/translation/run_translation.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/token-classification/run_ner.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Fix save path for tests

* Fix some model card kwargs

* Explain the magical -1000

* Actually enable tests this time

* Skip text classification PR until we fix shape inference

* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
Matt
2022-08-10 11:49:51 -04:00
committed by GitHub
parent d7e2d7b40b
commit 6eb51450fa
15 changed files with 1490 additions and 660 deletions

View File

@@ -18,11 +18,11 @@ Fine-tuning the library models for summarization.
"""
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from functools import partial
from typing import Optional
import datasets
@@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import transformers
@@ -38,7 +37,10 @@ from filelock import FileLock
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
KerasMetricCallback,
PushToHubCallback,
TFAutoModelForSeq2SeqLM,
TFTrainingArguments,
create_optimizer,
@@ -253,7 +255,6 @@ class DataTrainingArguments:
# endregion
# region Dataset name mappings
summarization_name_mapping = {
"amazon_reviews_multi": ("review_body", "review_title"),
@@ -272,71 +273,6 @@ summarization_name_mapping = {
# endregion
# region Data generator
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield one `(features, labels)` pair per example of `dataset`.

    Each example is padded with `tokenizer.pad` and every resulting array is
    converted to a TensorFlow tensor (with an int32 dtype hint). When `model`
    is given and exposes `prepare_decoder_input_ids_from_labels`, the decoder
    inputs derived from the labels are added under `"decoder_input_ids"`.

    Args:
        dataset: Indexable collection of examples supporting `len()`.
        model: Optional model used to derive decoder inputs from the labels.
        tokenizer: Object whose `pad` method pads a single example.
        shuffle: Visit the examples in a random order when truthy, otherwise
            in index order.
        pad_to_multiple_of: Forwarded unchanged to `tokenizer.pad`.

    Yields:
        A tuple of the feature dict and its `"labels"` tensor.
    """
    num_samples = len(dataset)
    visit_order = np.random.permutation(num_samples) if shuffle else np.arange(num_samples)
    needs_decoder_inputs = model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels")

    for position in visit_order:
        # Pad the raw example first, then turn every padded array into a tensor.
        padded = tokenizer.pad(
            dataset[int(position)], return_tensors="np", pad_to_multiple_of=pad_to_multiple_of
        )
        features = {}
        for name, values in padded.items():
            features[name] = tf.convert_to_tensor(values, dtype_hint=tf.int32)

        if needs_decoder_inputs:
            # The model helper is called on a batch of one, so add a leading
            # axis before the call and strip it again afterwards.
            batched_labels = tf.expand_dims(features["labels"], 0)
            shifted = model.prepare_decoder_input_ids_from_labels(labels=batched_labels)
            features["decoder_input_ids"] = tf.squeeze(shifted, 0)

        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
# endregion
# region Helper functions
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap `dataset` in a padded, batched and repeated `tf.data.Dataset`.

    Examples are produced lazily by `sample_generator`. Every feature except
    `"special_tokens_mask"` is declared as a variable-length int32 vector.
    Labels are padded with -100; all other features are padded with the
    tokenizer's pad token id (0 when the tokenizer has none).

    Args:
        dataset: Source dataset exposing `features`; may be None.
        model: Optional model; when it exposes
            `prepare_decoder_input_ids_from_labels`, a `"decoder_input_ids"`
            entry is added to the signature if not already present.
        tokenizer: Tokenizer supplying `pad_token_id` (and used by the generator).
        total_batch_size: Batch size passed to `padded_batch`.
        num_epochs: Number of repetitions of the batched dataset (cast to int).
        shuffle: Forwarded to `sample_generator`.

    Returns:
        The resulting `tf.data.Dataset`, or None when `dataset` is None.
    """
    if dataset is None:
        return None

    make_examples = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)

    # One variable-length int32 spec per feature the generator will emit.
    feature_specs = {}
    for column in dataset.features:
        if column == "special_tokens_mask":
            continue
        feature_specs[column] = tf.TensorSpec(shape=(None,), dtype=tf.int32)

    model_builds_decoder_inputs = model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels")
    if model_builds_decoder_inputs and "decoder_input_ids" not in feature_specs:
        feature_specs["decoder_input_ids"] = feature_specs["labels"]

    # This may need to be changed depending on your particular model or tokenizer!
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    fill_values = {column: tf.convert_to_tensor(pad_id, dtype=tf.int32) for column in feature_specs}
    fill_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)

    # The generator yields (features, labels); the labels entry reuses the
    # input_ids spec in both positions of the output signature.
    feature_specs["labels"] = feature_specs["input_ids"]
    output_signature = (feature_specs, feature_specs["labels"])

    # Auto-sharding is switched off for this dataset.
    shard_options = tf.data.Options()
    shard_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

    example_stream = tf.data.Dataset.from_generator(make_examples, output_signature=output_signature)
    example_stream = example_stream.with_options(shard_options)
    batches = example_stream.padded_batch(
        batch_size=total_batch_size,
        drop_remainder=True,
        padding_values=(fill_values, np.array(-100, dtype=np.int32)),
    )
    return batches.repeat(int(num_epochs))
# endregion
def main():
# region Argument parsing
# See all possible arguments in src/transformers/training_args.py
@@ -587,59 +523,148 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
data_collator = DataCollatorForSeq2Seq(
tokenizer,
model=model,
label_pad_token_id=label_pad_token_id,
pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation
return_tensors="tf",
)
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
tf_train_dataset = dataset_to_tf(
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
model,
tokenizer,
total_batch_size=total_train_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=data_collator,
batch_size=total_train_batch_size,
shuffle=True,
)
tf_eval_dataset = dataset_to_tf(
).with_options(dataset_options)
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
model,
tokenizer,
total_eval_batch_size,
num_epochs=1,
collate_fn=data_collator,
batch_size=total_eval_batch_size,
shuffle=False,
)
).with_options(dataset_options)
# endregion
# region Optimizer, loss and LR scheduling
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
)
def masked_sparse_categorical_crossentropy(y_true, y_pred):
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
# More pragmatically, consider redesigning your tokenizer.
losses = tf.keras.losses.sparse_categorical_crossentropy(
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
if training_args.do_train:
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# Compute the per-sample loss only over the unmasked tokens
losses = tf.ragged.boolean_mask(losses, y_true != -100)
losses = tf.reduce_mean(losses, axis=-1)
return losses
else:
optimizer = None
# endregion
# region Metric
metric = evaluate.load("rouge")
# region Metric and KerasMetricCallback
if training_args.do_eval:
metric = evaluate.load("rouge")
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
"num_beams": data_args.num_beams,
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
}
def compute_metrics(preds):
predictions, labels = preds
if isinstance(predictions, tuple):
predictions = predictions[0]
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
metrics = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
# Only print the mid f-measures, but there are a lot of other statistics in there too!
metrics = {key: round(val.mid.fmeasure * 100, 4) for key, val in metrics.items()}
return metrics
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
# For more information, see the docs at
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
metric_callback = KerasMetricCallback(
metric_fn=compute_metrics,
eval_dataset=tf_eval_dataset,
predict_with_generate=True,
use_xla_generation=True,
generate_kwargs=gen_kwargs,
)
callbacks = [metric_callback]
else:
callbacks = []
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-summarization"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
# Because this training can be quite long, we save once per epoch.
callbacks.append(
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
)
# endregion
# region Training
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
eval_metrics = None
if training_args.do_train:
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
@@ -648,28 +673,29 @@ def main():
logger.info(f" Total train batch size = {total_train_batch_size}")
logger.info(f" Total optimization steps = {num_train_steps}")
model.fit(
tf_train_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=num_update_steps_per_epoch,
)
if training_args.xla and not data_args.pad_to_max_length:
logger.warning(
"XLA training may be slow at first when --pad_to_max_length is not set "
"until all possible shapes have been compiled."
)
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
eval_metrics = {key: val[-1] for key, val in history.history.items()}
# endregion
# region Validation
if data_args.val_max_target_length is None:
data_args.val_max_target_length = data_args.max_target_length
gen_kwargs = {
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
"num_beams": data_args.num_beams,
}
if training_args.do_eval:
if training_args.do_eval and not training_args.do_train:
# Do a standalone evaluation run
logger.info("Evaluation...")
for batch, labels in tqdm(
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
):
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
@tf.function(jit_compile=True)
def generate(**kwargs):
return model.generate(**kwargs)
for batch, labels in tf_eval_dataset:
batch.update(gen_kwargs)
generated_tokens = model.generate(**batch)
generated_tokens = generate(**batch)
if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
@@ -679,13 +705,19 @@ def main():
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
result = metric.compute(use_stemmer=True)
result = {k: round(v * 100, 4) for k, v in result.items()}
eval_metrics = metric.compute(use_stemmer=True)
result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()}
logger.info(result)
# endregion
if training_args.output_dir is not None:
if training_args.output_dir is not None and eval_metrics is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metrics))
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)