TF Examples Rewrite (#18451)

* Finished QA example

* Dodge a merge conflict

* Update text classification and LM examples

* Update NER example

* New Keras metrics WIP, fix NER example

* Update NER example

* Update MC, summarization and translation examples

* Add XLA warnings when shapes are variable

* Make sure batch_size is consistently scaled by num_replicas

* Add PushToHubCallback to all models

* Add docs links for KerasMetricCallback

* Add docs links for prepare_tf_dataset and jit_compile

* Correct inferred model names

* Don't assume the dataset has 'lang'

* Don't assume the dataset has 'lang'

* Write metrics in text classification

* Add 'framework' to TrainingArguments and TFTrainingArguments

* Export metrics in all examples and add tests

* Fix training args for Flax

* Update command line args for translation test

* make fixup

* Fix accidentally running other tests in fp16

* Remove do_train/do_eval from run_clm.py

* Remove do_train/do_eval from run_mlm.py

* Add tensorflow tests to circleci

* Fix circleci

* Update examples/tensorflow/language-modeling/run_mlm.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/test_tensorflow_examples.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/translation/run_translation.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Update examples/tensorflow/token-classification/run_ner.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Fix save path for tests

* Fix some model card kwargs

* Explain the magical -1000

* Actually enable tests this time

* Skip text classification PR until we fix shape inference

* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
Matt
2022-08-10 11:49:51 -04:00
committed by GitHub
parent d7e2d7b40b
commit 6eb51450fa
15 changed files with 1490 additions and 660 deletions

View File

@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
"""
# You can also adapt this script to your own question-answering task. Pointers for this are left as comments.
import json
import logging
import os
import sys
@@ -33,13 +34,13 @@ import transformers
from transformers import (
AutoConfig,
AutoTokenizer,
DataCollatorWithPadding,
DefaultDataCollator,
EvalPrediction,
HfArgumentParser,
PreTrainedTokenizerFast,
PushToHubCallback,
TFAutoModelForQuestionAnswering,
TFTrainingArguments,
create_optimizer,
set_seed,
)
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
@@ -609,7 +610,12 @@ def main():
# endregion
with training_args.strategy.scope():
# region Load model
dataset_options = tf.data.Options()
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
num_replicas = training_args.strategy.num_replicas_in_sync
# region Load model and prepare datasets
if checkpoint is None:
model_path = model_args.model_name_or_path
else:
@@ -621,71 +627,163 @@ def main():
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
optimizer = tf.keras.optimizers.Adam(
learning_rate=training_args.learning_rate,
beta_1=training_args.adam_beta1,
beta_2=training_args.adam_beta2,
epsilon=training_args.adam_epsilon,
clipnorm=training_args.max_grad_norm,
)
if training_args.do_train:
training_dataset = model.prepare_tf_dataset(
processed_datasets["train"],
shuffle=True,
batch_size=training_args.per_device_train_batch_size * num_replicas,
tokenizer=tokenizer,
)
training_dataset = training_dataset.with_options(dataset_options)
num_train_steps = len(training_dataset) * training_args.num_train_epochs
if training_args.warmup_steps > 0:
num_warmup_steps = training_args.warmup_steps
elif training_args.warmup_ratio > 0:
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
else:
num_warmup_steps = 0
optimizer, schedule = create_optimizer(
init_lr=training_args.learning_rate,
num_train_steps=len(training_dataset) * training_args.num_train_epochs,
num_warmup_steps=num_warmup_steps,
adam_beta1=training_args.adam_beta1,
adam_beta2=training_args.adam_beta2,
adam_epsilon=training_args.adam_epsilon,
weight_decay_rate=training_args.weight_decay,
adam_global_clipnorm=training_args.max_grad_norm,
)
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
else:
model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
training_dataset = None
if training_args.do_eval:
eval_dataset = model.prepare_tf_dataset(
processed_datasets["validation"],
shuffle=False,
batch_size=training_args.per_device_train_batch_size * num_replicas,
tokenizer=tokenizer,
)
eval_dataset = eval_dataset.with_options(dataset_options)
else:
eval_dataset = None
if training_args.do_predict:
predict_dataset = model.prepare_tf_dataset(
processed_datasets["test"],
shuffle=False,
batch_size=training_args.per_device_eval_batch_size * num_replicas,
tokenizer=tokenizer,
)
predict_dataset = predict_dataset.with_options(dataset_options)
else:
predict_dataset = None
# no user-specified loss = will use the model internal loss
model.compile(optimizer=optimizer)
# endregion
# region Training
if padding:
data_collator = DefaultDataCollator(return_tensors="tf")
# region Preparing push_to_hub and model card
push_to_hub_model_id = training_args.push_to_hub_model_id
model_name = model_args.model_name_or_path.split("/")[-1]
if not push_to_hub_model_id:
if data_args.dataset_name is not None:
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
else:
push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
model_card_kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
model_card_kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
callbacks = [
PushToHubCallback(
output_dir=training_args.output_dir,
model_id=push_to_hub_model_id,
organization=training_args.push_to_hub_organization,
token=training_args.push_to_hub_token,
tokenizer=tokenizer,
**model_card_kwargs,
)
]
else:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
tensor_keys = ["attention_mask", "input_ids"]
label_keys = ["start_positions", "end_positions"]
callbacks = []
# endregion
# region Training and Evaluation
if training_args.do_train:
# Make a tf.data.Dataset for this
training_dataset = processed_datasets["train"].to_tf_dataset(
# labels are passed as input, as we will use the model's internal loss
columns=tensor_keys + label_keys,
shuffle=True,
batch_size=training_args.per_device_train_batch_size,
collate_fn=data_collator,
drop_remainder=True,
)
model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
# endregion
# Note that the validation and test datasets have been processed in a different way to the
# training datasets in this example, and so they don't have the same label structure.
# As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
# after training.
model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
# region Evaluation
if training_args.do_eval:
logger.info("*** Evaluation ***")
eval_inputs = {
"input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
"attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
}
eval_predictions = model.predict(eval_inputs)
# In this example, we compute advanced metrics at the end of training, but
# if you'd like to compute metrics every epoch that are too complex to be written as
# standard Keras metrics, you can use our KerasMetricCallback. See
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
eval_predictions = model.predict(eval_dataset)
if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
# padding positions are correctly masked.
eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
else:
eval_start_logits = eval_predictions.start_logits
eval_end_logits = eval_predictions.end_logits
post_processed_eval = post_processing_function(
datasets["validation"],
processed_datasets["validation"],
(eval_predictions.start_logits, eval_predictions.end_logits),
(eval_start_logits, eval_end_logits),
)
metrics = compute_metrics(post_processed_eval)
logging.info("Evaluation metrics:")
for metric, value in metrics.items():
logging.info(f"{metric}: {value:.3f}")
if training_args.output_dir is not None:
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
with open(output_eval_file, "w") as writer:
writer.write(json.dumps(metrics))
# endregion
# region Prediction
if training_args.do_predict:
logger.info("*** Predict ***")
predict_inputs = {
"input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
"attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
}
test_predictions = model.predict(predict_inputs)
test_predictions = model.predict(predict_dataset)
if isinstance(test_predictions.start_logits, tf.RaggedTensor):
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
# padding positions are correctly masked.
test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
else:
test_start_logits = test_predictions.start_logits
test_end_logits = test_predictions.end_logits
post_processed_test = post_processing_function(
datasets["test"],
processed_datasets["test"],
(test_predictions.start_logits, test_predictions.end_logits),
(test_start_logits, test_end_logits),
)
metrics = compute_metrics(post_processed_test)
@@ -694,8 +792,9 @@ def main():
logging.info(f"{metric}: {value:.3f}")
# endregion
if training_args.push_to_hub:
model.push_to_hub()
if training_args.output_dir is not None and not training_args.push_to_hub:
# If we're not pushing to hub, at least save a local copy when we're done
model.save_pretrained(training_args.output_dir)
if __name__ == "__main__":