TF Examples Rewrite (#18451)
* Finished QA example
* Dodge a merge conflict
* Update text classification and LM examples
* Update NER example
* New Keras metrics WIP, fix NER example
* Update NER example
* Update MC, summarization and translation examples
* Add XLA warnings when shapes are variable
* Make sure batch_size is consistently scaled by num_replicas
* Add PushToHubCallback to all models
* Add docs links for KerasMetricCallback
* Add docs links for prepare_tf_dataset and jit_compile
* Correct inferred model names
* Don't assume the dataset has 'lang'
* Don't assume the dataset has 'lang'
* Write metrics in text classification
* Add 'framework' to TrainingArguments and TFTrainingArguments
* Export metrics in all examples and add tests
* Fix training args for Flax
* Update command line args for translation test
* make fixup
* Fix accidentally running other tests in fp16
* Remove do_train/do_eval from run_clm.py
* Remove do_train/do_eval from run_mlm.py
* Add tensorflow tests to circleci
* Fix circleci
* Update examples/tensorflow/language-modeling/run_mlm.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/test_tensorflow_examples.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/translation/run_translation.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/token-classification/run_ner.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Fix save path for tests
* Fix some model card kwargs
* Explain the magical -1000
* Actually enable tests this time
* Skip text classification PR until we fix shape inference
* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -18,11 +18,11 @@ Fine-tuning the library models for summarization.
|
||||
"""
|
||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
@@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
import evaluate
|
||||
import transformers
|
||||
@@ -38,7 +37,10 @@ from filelock import FileLock
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
HfArgumentParser,
|
||||
KerasMetricCallback,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForSeq2SeqLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -253,7 +255,6 @@ class DataTrainingArguments:
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Dataset name mappings
|
||||
summarization_name_mapping = {
|
||||
"amazon_reviews_multi": ("review_body", "review_title"),
|
||||
@@ -272,71 +273,6 @@ summarization_name_mapping = {
|
||||
# endregion
|
||||
|
||||
|
||||
# region Data generator
|
||||
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield one `(features, labels)` pair per row of `dataset`.

    Args:
        dataset: Indexable collection of tokenized examples; rows are fetched with `dataset[int(i)]`.
        model: Optional model. When it exposes `prepare_decoder_input_ids_from_labels`, that method is
            used to add a `decoder_input_ids` entry to every example.
        tokenizer: Object whose `pad` method turns a row into a mapping of NumPy arrays.
        shuffle: Visit the rows in a random permutation when truthy, otherwise in index order.
        pad_to_multiple_of: Forwarded unchanged to `tokenizer.pad`.

    Yields:
        A tuple holding the feature dict (values converted to tensors) and its `"labels"` tensor.
    """
    row_count = len(dataset)
    visit_order = np.random.permutation(row_count) if shuffle else np.arange(row_count)

    for position in visit_order:
        # Handle dicts with proper padding and conversion to tensor.
        padded = tokenizer.pad(
            dataset[int(position)], return_tensors="np", pad_to_multiple_of=pad_to_multiple_of
        )
        features = {}
        for name, values in padded.items():
            features[name] = tf.convert_to_tensor(values, dtype_hint=tf.int32)

        if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
            # The model method is called on the labels with a leading axis of size 1 added;
            # that axis is squeezed away again before the result is stored.
            batched_labels = tf.expand_dims(features["labels"], 0)
            batched_decoder_ids = model.prepare_decoder_input_ids_from_labels(labels=batched_labels)
            features["decoder_input_ids"] = tf.squeeze(batched_decoder_ids, 0)

        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper functions
|
||||
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap `dataset` in a padded, batched and repeated `tf.data.Dataset`.

    Args:
        dataset: Dataset exposing `features` (iterated for column names) and row indexing, or `None`.
        model: Optional model; forwarded to `sample_generator` and inspected for
            `prepare_decoder_input_ids_from_labels`.
        tokenizer: Tokenizer forwarded to `sample_generator`; its `pad_token_id` supplies the padding value.
        total_batch_size: Batch size passed to `padded_batch`.
        num_epochs: Number of repetitions of the batched dataset (converted with `int`).
        shuffle: Forwarded to `sample_generator`.

    Returns:
        `None` when `dataset` is `None`, otherwise the resulting `tf.data.Dataset`.
    """
    if dataset is None:
        return None

    generator_fn = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)

    # Every kept column is declared as a variable-length int32 vector.
    feature_specs = {}
    for column in dataset.features:
        if column == "special_tokens_mask":
            continue
        feature_specs[column] = tf.TensorSpec(shape=(None,), dtype=tf.int32)

    if (
        model is not None
        and "decoder_input_ids" not in feature_specs
        and hasattr(model, "prepare_decoder_input_ids_from_labels")
    ):
        feature_specs["decoder_input_ids"] = feature_specs["labels"]

    # This may need to be changed depending on your particular model or tokenizer!
    pad_fill = {}
    for column in feature_specs.keys():
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        pad_fill[column] = tf.convert_to_tensor(pad_id, dtype=tf.int32)
    # Labels are padded with -100 rather than the tokenizer's pad id.
    pad_fill["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)

    feature_specs["labels"] = feature_specs["input_ids"]
    output_signature = (feature_specs, feature_specs["labels"])

    shard_options = tf.data.Options()
    shard_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

    unbatched = tf.data.Dataset.from_generator(generator_fn, output_signature=output_signature)
    unbatched = unbatched.with_options(shard_options)
    batched = unbatched.padded_batch(
        batch_size=total_batch_size,
        drop_remainder=True,
        padding_values=(pad_fill, np.array(-100, dtype=np.int32)),
    )
    return batched.repeat(int(num_epochs))
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
def main():
|
||||
# region Argument parsing
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
@@ -587,59 +523,148 @@ def main():
|
||||
if model.config.decoder_start_token_id is None:
|
||||
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
||||
|
||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer,
|
||||
model=model,
|
||||
label_pad_token_id=label_pad_token_id,
|
||||
pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation
|
||||
return_tensors="tf",
|
||||
)
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
tf_train_dataset = dataset_to_tf(
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_batch_size=total_train_batch_size,
|
||||
num_epochs=training_args.num_train_epochs,
|
||||
collate_fn=data_collator,
|
||||
batch_size=total_train_batch_size,
|
||||
shuffle=True,
|
||||
)
|
||||
tf_eval_dataset = dataset_to_tf(
|
||||
).with_options(dataset_options)
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_eval_batch_size,
|
||||
num_epochs=1,
|
||||
collate_fn=data_collator,
|
||||
batch_size=total_eval_batch_size,
|
||||
shuffle=False,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and LR scheduling
|
||||
# Scheduler and math around the number of training steps.
|
||||
num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
|
||||
)
|
||||
|
||||
def masked_sparse_categorical_crossentropy(y_true, y_pred):
|
||||
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
|
||||
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
|
||||
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
|
||||
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
|
||||
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
|
||||
# More pragmatically, consider redesigning your tokenizer.
|
||||
losses = tf.keras.losses.sparse_categorical_crossentropy(
|
||||
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
|
||||
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
if training_args.do_train:
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
# Compute the per-sample loss only over the unmasked tokens
|
||||
losses = tf.ragged.boolean_mask(losses, y_true != -100)
|
||||
losses = tf.reduce_mean(losses, axis=-1)
|
||||
return losses
|
||||
else:
|
||||
optimizer = None
|
||||
|
||||
# endregion
|
||||
|
||||
# region Metric
|
||||
metric = evaluate.load("rouge")
|
||||
# region Metric and KerasMetricCallback
|
||||
if training_args.do_eval:
|
||||
metric = evaluate.load("rouge")
|
||||
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
|
||||
}
|
||||
|
||||
def compute_metrics(preds):
    """Decode predictions and labels and score them with the enclosing scope's `metric`.

    Args:
        preds: A `(predictions, labels)` pair. If `predictions` is a tuple, only its first element is used.

    Returns:
        A dict mapping each metric key to its mid F-measure, scaled by 100 and rounded to 4 decimals.
    """
    generated, references = preds
    if isinstance(generated, tuple):
        generated = generated[0]
    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Swap the -100 label entries for the pad token id before decoding.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    reference_text = tokenizer.batch_decode(references, skip_special_tokens=True)

    generated_text, reference_text = postprocess_text(generated_text, reference_text)
    raw_scores = metric.compute(predictions=generated_text, references=reference_text, use_stemmer=True)

    # Only print the mid f-measures, but there are a lot of other statistics in there too!
    summary = {}
    for score_name, score in raw_scores.items():
        summary[score_name] = round(score.mid.fmeasure * 100, 4)
    return summary
|
||||
|
||||
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
|
||||
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
|
||||
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
|
||||
# For more information, see the docs at
|
||||
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
|
||||
|
||||
metric_callback = KerasMetricCallback(
|
||||
metric_fn=compute_metrics,
|
||||
eval_dataset=tf_eval_dataset,
|
||||
predict_with_generate=True,
|
||||
use_xla_generation=True,
|
||||
generate_kwargs=gen_kwargs,
|
||||
)
|
||||
callbacks = [metric_callback]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-summarization"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# Because this training can be quite long, we save once per epoch.
|
||||
callbacks.append(
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
|
||||
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||
eval_metrics = None
|
||||
if training_args.do_train:
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
@@ -648,28 +673,29 @@ def main():
|
||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {num_train_steps}")
|
||||
|
||||
model.fit(
|
||||
tf_train_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=num_update_steps_per_epoch,
|
||||
)
|
||||
if training_args.xla and not data_args.pad_to_max_length:
|
||||
logger.warning(
|
||||
"XLA training may be slow at first when --pad_to_max_length is not set "
|
||||
"until all possible shapes have been compiled."
|
||||
)
|
||||
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||
# endregion
|
||||
|
||||
# region Validation
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
}
|
||||
if training_args.do_eval:
|
||||
if training_args.do_eval and not training_args.do_train:
|
||||
# Do a standalone evaluation run
|
||||
logger.info("Evaluation...")
|
||||
for batch, labels in tqdm(
|
||||
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
|
||||
):
|
||||
|
||||
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
|
||||
@tf.function(jit_compile=True)
|
||||
def generate(**kwargs):
|
||||
return model.generate(**kwargs)
|
||||
|
||||
for batch, labels in tf_eval_dataset:
|
||||
batch.update(gen_kwargs)
|
||||
generated_tokens = model.generate(**batch)
|
||||
generated_tokens = generate(**batch)
|
||||
if isinstance(generated_tokens, tuple):
|
||||
generated_tokens = generated_tokens[0]
|
||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||
@@ -679,13 +705,19 @@ def main():
|
||||
|
||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||
|
||||
result = metric.compute(use_stemmer=True)
|
||||
result = {k: round(v * 100, 4) for k, v in result.items()}
|
||||
eval_metrics = metric.compute(use_stemmer=True)
|
||||
|
||||
result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()}
|
||||
logger.info(result)
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
if training_args.output_dir is not None and eval_metrics is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metrics))
|
||||
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user