TF Examples Rewrite (#18451)
* Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
|
||||
"""
|
||||
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -33,13 +34,13 @@ import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
DefaultDataCollator,
|
||||
EvalPrediction,
|
||||
HfArgumentParser,
|
||||
PreTrainedTokenizerFast,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForQuestionAnswering,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
|
||||
@@ -609,7 +610,12 @@ def main():
|
||||
# endregion
|
||||
|
||||
with training_args.strategy.scope():
|
||||
# region Load model
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
|
||||
# region Load model and prepare datasets
|
||||
if checkpoint is None:
|
||||
model_path = model_args.model_name_or_path
|
||||
else:
|
||||
@@ -621,71 +627,163 @@ def main():
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
optimizer = tf.keras.optimizers.Adam(
|
||||
learning_rate=training_args.learning_rate,
|
||||
beta_1=training_args.adam_beta1,
|
||||
beta_2=training_args.adam_beta2,
|
||||
epsilon=training_args.adam_epsilon,
|
||||
clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
if training_args.do_train:
|
||||
|
||||
training_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["train"],
|
||||
shuffle=True,
|
||||
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
training_dataset = training_dataset.with_options(dataset_options)
|
||||
|
||||
num_train_steps = len(training_dataset) * training_args.num_train_epochs
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
optimizer, schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=len(training_dataset) * training_args.num_train_epochs,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||
|
||||
else:
|
||||
model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||
training_dataset = None
|
||||
|
||||
if training_args.do_eval:
|
||||
eval_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["validation"],
|
||||
shuffle=False,
|
||||
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
eval_dataset = eval_dataset.with_options(dataset_options)
|
||||
else:
|
||||
eval_dataset = None
|
||||
|
||||
if training_args.do_predict:
|
||||
predict_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["test"],
|
||||
shuffle=False,
|
||||
batch_size=training_args.per_device_eval_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
predict_dataset = predict_dataset.with_options(dataset_options)
|
||||
else:
|
||||
predict_dataset = None
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer)
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
if padding:
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
||||
tensor_keys = ["attention_mask", "input_ids"]
|
||||
label_keys = ["start_positions", "end_positions"]
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and Evaluation
|
||||
|
||||
if training_args.do_train:
|
||||
# Make a tf.data.Dataset for this
|
||||
training_dataset = processed_datasets["train"].to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=tensor_keys + label_keys,
|
||||
shuffle=True,
|
||||
batch_size=training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
)
|
||||
model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
|
||||
# endregion
|
||||
# Note that the validation and test datasets have been processed in a different way to the
|
||||
# training datasets in this example, and so they don't have the same label structure.
|
||||
# As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
|
||||
# after training.
|
||||
model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||
|
||||
# region Evaluation
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluation ***")
|
||||
eval_inputs = {
|
||||
"input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
|
||||
"attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
|
||||
}
|
||||
eval_predictions = model.predict(eval_inputs)
|
||||
|
||||
# In this example, we compute advanced metrics at the end of training, but
|
||||
# if you'd like to compute metrics every epoch that are too complex to be written as
|
||||
# standard Keras metrics, you can use our KerasMetricCallback. See
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
|
||||
|
||||
eval_predictions = model.predict(eval_dataset)
|
||||
if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
|
||||
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||
# padding positions are correctly masked.
|
||||
eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||
eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||
else:
|
||||
eval_start_logits = eval_predictions.start_logits
|
||||
eval_end_logits = eval_predictions.end_logits
|
||||
|
||||
post_processed_eval = post_processing_function(
|
||||
datasets["validation"],
|
||||
processed_datasets["validation"],
|
||||
(eval_predictions.start_logits, eval_predictions.end_logits),
|
||||
(eval_start_logits, eval_end_logits),
|
||||
)
|
||||
metrics = compute_metrics(post_processed_eval)
|
||||
logging.info("Evaluation metrics:")
|
||||
for metric, value in metrics.items():
|
||||
logging.info(f"{metric}: {value:.3f}")
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(metrics))
|
||||
# endregion
|
||||
|
||||
# region Prediction
|
||||
if training_args.do_predict:
|
||||
logger.info("*** Predict ***")
|
||||
predict_inputs = {
|
||||
"input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
|
||||
"attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
|
||||
}
|
||||
test_predictions = model.predict(predict_inputs)
|
||||
|
||||
test_predictions = model.predict(predict_dataset)
|
||||
if isinstance(test_predictions.start_logits, tf.RaggedTensor):
|
||||
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||
# padding positions are correctly masked.
|
||||
test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||
test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||
else:
|
||||
test_start_logits = test_predictions.start_logits
|
||||
test_end_logits = test_predictions.end_logits
|
||||
post_processed_test = post_processing_function(
|
||||
datasets["test"],
|
||||
processed_datasets["test"],
|
||||
(test_predictions.start_logits, test_predictions.end_logits),
|
||||
(test_start_logits, test_end_logits),
|
||||
)
|
||||
metrics = compute_metrics(post_processed_test)
|
||||
|
||||
@@ -694,8 +792,9 @@ def main():
|
||||
logging.info(f"{metric}: {value:.3f}")
|
||||
# endregion
|
||||
|
||||
if training_args.push_to_hub:
|
||||
model.push_to_hub()
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user