TF Examples Rewrite (#18451)

* Finished QA example

* Dodge a merge conflict

* Update text classification and LM examples

* Update NER example

* New Keras metrics WIP, fix NER example

* Update NER example

* Update MC, summarization and translation examples

* Add XLA warnings when shapes are variable

* Make sure batch_size is consistently scaled by num_replicas

* Add PushToHubCallback to all models

* Add docs links for KerasMetricCallback

* Add docs links for prepare_tf_dataset and jit_compile

* Correct inferred model names

* Don't assume the dataset has 'lang'

* Don't assume the dataset has 'lang'

* Write metrics in text classification

* Add 'framework' to TrainingArguments and TFTrainingArguments

* Export metrics in all examples and add tests

* Fix training args for Flax

* Update command line args for translation test

* make fixup

* Fix accidentally running other tests in fp16

* Remove do_train/do_eval from run_clm.py

* Remove do_train/do_eval from run_mlm.py

* Add tensorflow tests to circleci

* Fix circleci

* Update examples/tensorflow/language-modeling/run_mlm.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/test_tensorflow_examples.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/translation/run_translation.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/token-classification/run_ner.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Fix save path for tests

* Fix some model card kwargs

* Explain the magical -1000

* Actually enable tests this time

* Skip text classification PR until we fix shape inference

* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
Matt
2022-08-10 11:49:51 -04:00
committed by GitHub
parent d7e2d7b40b
commit 6eb51450fa
15 changed files with 1490 additions and 660 deletions

View File

@@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C
without using a Trainer.
"""
import json
import logging
import os
import random
from dataclasses import dataclass, field
from functools import partial
from typing import Optional
import datasets
import numpy as np
import tensorflow as tf
from datasets import ClassLabel, load_dataset
@@ -33,10 +33,11 @@ import evaluate
import transformers
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AutoConfig,
AutoTokenizer,
DataCollatorForTokenClassification,
HfArgumentParser,
PushToHubCallback,
TFAutoModelForTokenClassification,
TFTrainingArguments,
create_optimizer,
@@ -48,11 +49,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt")
# region Command-line arguments
@@ -195,61 +192,6 @@ class DataTrainingArguments:
# endregion
# region Data generator
def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield ``(features, labels)`` pairs, one example of ``dataset`` at a time.

    Args:
        dataset: Indexable collection of tokenized examples; must support ``len()``.
        tokenizer: Tokenizer providing ``pad()`` and ``pad_token_id``.
        shuffle: If true, examples are visited in a fresh random permutation;
            otherwise in index order.
        pad_to_multiple_of: Forwarded unchanged to ``tokenizer.pad``.

    Yields:
        A tuple ``(features, labels)`` where ``features`` is a dict of tensors
        (including the ``"labels"`` entry) and ``labels`` is that same
        ``features["labels"]`` tensor.
    """
    num_examples = len(dataset)
    # Every example is visited exactly once per pass; only the order depends on `shuffle`.
    visit_order = np.random.permutation(num_examples) if shuffle else np.arange(num_examples)

    for position in visit_order:
        # Pad the single example and get NumPy arrays back so labels can be edited in place.
        padded = tokenizer.pad(
            dataset[int(position)],
            return_tensors="np",
            pad_to_multiple_of=pad_to_multiple_of,
        )
        if tokenizer.pad_token_id is not None:
            # Label positions where the attention mask is 0 are overwritten with -100.
            # NOTE(review): assumes "labels" has the same length as "attention_mask"
            # after tokenizer.pad - confirm when pad_to_multiple_of is set.
            padding_positions = padded["attention_mask"] == 0
            padded["labels"][padding_positions] = -100
        features = {name: tf.convert_to_tensor(values) for name, values in padded.items()}
        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
# endregion
# region Helper functions
def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap ``dataset`` in a padded, batched and repeated ``tf.data.Dataset``.

    Examples are produced lazily by ``sample_generator``; each batch is padded
    to the longest example it contains, and incomplete final batches are dropped.

    Args:
        dataset: Tokenized dataset exposing ``features`` (its column names).
        tokenizer: Tokenizer handed to ``sample_generator``; its ``pad_token_id``
            (when not ``None``) is used as the padding value for ``input_ids``.
        total_batch_size: Number of examples per batch.
        num_epochs: Number of times the batched dataset is repeated (cast to ``int``).
        shuffle: Forwarded to ``sample_generator``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches.
    """
    example_stream = partial(sample_generator, dataset, tokenizer, shuffle=shuffle)

    # One variable-length int64 vector per column, excluding "special_tokens_mask".
    feature_specs = {}
    for column in dataset.features:
        if column == "special_tokens_mask":
            continue
        feature_specs[column] = tf.TensorSpec(shape=(None,), dtype=tf.int64)

    # This may need to be changed depending on your particular model or tokenizer!
    # NOTE(review): this covers every column in dataset.features, whereas the
    # signature above leaves out "special_tokens_mask" - confirm the two cannot diverge.
    pad_values = {}
    for column in dataset.features:
        pad_values[column] = tf.convert_to_tensor(0, dtype=tf.int64)
    pad_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64)
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is not None:
        pad_values["input_ids"] = tf.convert_to_tensor(pad_token_id, dtype=tf.int64)

    # Labels reuse the input_ids spec, both inside the feature dict and as the target.
    feature_specs["labels"] = feature_specs["input_ids"]
    output_signature = (feature_specs, feature_specs["labels"])

    # Auto-sharding is switched off for this dataset.
    shard_options = tf.data.Options()
    shard_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

    per_example = tf.data.Dataset.from_generator(example_stream, output_signature=output_signature)
    per_example = per_example.with_options(shard_options)
    batched = per_example.padded_batch(
        batch_size=total_batch_size,
        drop_remainder=True,
        # The stand-alone labels target (second tuple element) is padded with 0.
        padding_values=(pad_values, np.array(0, dtype=np.int64)),
    )
    return batched.repeat(int(num_epochs))
# endregion
def main():
# region Argument Parsing
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
@@ -419,6 +361,14 @@ def main():
train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["validation"]
if data_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
if data_args.max_eval_samples is not None:
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
eval_dataset = eval_dataset.select(range(max_eval_samples))
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
@@ -439,43 +389,62 @@ def main():
# endregion
# region Create TF datasets
# We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
# well as inputs.
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
num_replicas = training_args.strategy.num_replicas_in_sync
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
train_batches_per_epoch = len(train_dataset) // total_train_batch_size
tf_train_dataset = dataset_to_tf(
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
# yourself if you use this method, whereas they are automatically inferred from the model input names when
# using model.prepare_tf_dataset()
# For more info see the docs:
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
tf_train_dataset = model.prepare_tf_dataset(
train_dataset,
tokenizer,
total_batch_size=total_train_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=collate_fn,
batch_size=total_train_batch_size,
shuffle=True,
)
).with_options(dataset_options)
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
tf_eval_dataset = dataset_to_tf(
tf_eval_dataset = model.prepare_tf_dataset(
eval_dataset,
tokenizer,
total_batch_size=total_eval_batch_size,
num_epochs=training_args.num_train_epochs,
collate_fn=collate_fn,
batch_size=total_eval_batch_size,
shuffle=False,
)
).with_options(dataset_options)
# endregion
# region Optimizer, loss and compilation
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, lr_schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch),
num_warmup_steps=training_args.warmup_steps,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
def dummy_loss(y_true, y_pred):
return tf.reduce_mean(y_pred)
model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
# endregion
# Metrics
@@ -517,6 +486,39 @@ def main():
# endregion
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-token-classification"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
callbacks = []
# endregion
# region Training
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
@@ -524,23 +526,43 @@ def main():
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
logger.info(f" Total train batch size = {total_train_batch_size}")
# Only show the progress bar once on each machine.
model.fit(
tf_train_dataset,
validation_data=tf_eval_dataset,
epochs=int(training_args.num_train_epochs),
steps_per_epoch=train_batches_per_epoch,
validation_steps=eval_batches_per_epoch,
callbacks=callbacks,
)
# endregion
# region Predictions
# For predictions, we preload the entire validation set - note that if you have a really giant validation
# set, you might need to change this!
eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features}
predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"]
predictions = tf.math.argmax(predictions, axis=-1)
labels = np.array(eval_inputs["labels"])
labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
# If your batches have variable sequence lengths (i.e. you are not using pad_to_max_length), then
# this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying sequence
# length from predict().
try:
predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"]
except tf.python.framework.errors_impl.InvalidArgumentError:
raise ValueError(
"Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
"then you will need to use --pad_to_max_length to generate predictions, as older "
"versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
)
if isinstance(predictions, tf.RaggedTensor):
predictions = predictions.to_tensor(default_value=-100)
predictions = tf.math.argmax(predictions, axis=-1).numpy()
if "label" in eval_dataset:
labels = eval_dataset.with_format("tf")["label"]
else:
labels = eval_dataset.with_format("tf")["labels"]
if isinstance(labels, tf.RaggedTensor):
labels = labels.to_tensor(default_value=-100)
labels = labels.numpy()
attention_mask = eval_dataset.with_format("tf")["attention_mask"]
if isinstance(attention_mask, tf.RaggedTensor):
attention_mask = attention_mask.to_tensor(default_value=-100)
attention_mask = attention_mask.numpy()
labels[attention_mask == 0] = -100
preds, refs = get_labels(predictions, labels)
metric.add_batch(
predictions=preds,
@@ -550,12 +572,15 @@ def main():
logger.info("Evaluation metrics:")
for key, val in eval_metric.items():
logger.info(f"{key}: {val:.4f}")
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(eval_metric))
# endregion
# We don't do predictions in the strategy scope because there are some issues in there right now.
# They'll get fixed eventually, promise!
if training_args.output_dir is not None:
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)