TF Examples Rewrite (#18451)

* Finished QA example
* Dodge a merge conflict
* Update text classification and LM examples
* Update NER example
* New Keras metrics WIP, fix NER example
* Update NER example
* Update MC, summarization and translation examples
* Add XLA warnings when shapes are variable
* Make sure batch_size is consistently scaled by num_replicas
* Add PushToHubCallback to all models
* Add docs links for KerasMetricCallback
* Add docs links for prepare_tf_dataset and jit_compile
* Correct inferred model names
* Don't assume the dataset has 'lang'
* Don't assume the dataset has 'lang'
* Write metrics in text classification
* Add 'framework' to TrainingArguments and TFTrainingArguments
* Export metrics in all examples and add tests
* Fix training args for Flax
* Update command line args for translation test
* make fixup
* Fix accidentally running other tests in fp16
* Remove do_train/do_eval from run_clm.py
* Remove do_train/do_eval from run_mlm.py
* Add tensorflow tests to circleci
* Fix circleci
* Update examples/tensorflow/language-modeling/run_mlm.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/test_tensorflow_examples.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/translation/run_translation.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/token-classification/run_ner.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Fix save path for tests
* Fix some model card kwargs
* Explain the magical -1000
* Actually enable tests this time
* Skip text classification PR until we fix shape inference
* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2022-08-10 11:49:51 -04:00
parent d7e2d7b40b
commit 6eb51450fa
15 changed files with 1490 additions and 660 deletions
--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
 """
 # You can also adapt this script on your own question answering task. Pointers for this are left as comments.

+import json
 import logging
 import os
 import sys
@@ -33,13 +34,13 @@ import transformers
 from transformers import (
    AutoConfig,
    AutoTokenizer,
-    DataCollatorWithPadding,
-    DefaultDataCollator,
    EvalPrediction,
    HfArgumentParser,
    PreTrainedTokenizerFast,
+    PushToHubCallback,
    TFAutoModelForQuestionAnswering,
    TFTrainingArguments,
+    create_optimizer,
    set_seed,
 )
 from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
@@ -609,7 +610,12 @@ def main():
    # endregion

    with training_args.strategy.scope():
-        # region Load model
+
+        dataset_options = tf.data.Options()
+        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+        num_replicas = training_args.strategy.num_replicas_in_sync
+
+        # region Load model and prepare datasets
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
@@ -621,71 +627,163 @@ def main():
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
-        optimizer = tf.keras.optimizers.Adam(
-            learning_rate=training_args.learning_rate,
-            beta_1=training_args.adam_beta1,
-            beta_2=training_args.adam_beta2,
-            epsilon=training_args.adam_epsilon,
-            clipnorm=training_args.max_grad_norm,
-        )
+        if training_args.do_train:
+
+            training_dataset = model.prepare_tf_dataset(
+                processed_datasets["train"],
+                shuffle=True,
+                batch_size=training_args.per_device_train_batch_size * num_replicas,
+                tokenizer=tokenizer,
+            )
+
+            training_dataset = training_dataset.with_options(dataset_options)
+
+            num_train_steps = len(training_dataset) * training_args.num_train_epochs
+            if training_args.warmup_steps > 0:
+                num_warmup_steps = training_args.warmup_steps
+            elif training_args.warmup_ratio > 0:
+                num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
+            else:
+                num_warmup_steps = 0
+
+            optimizer, schedule = create_optimizer(
+                init_lr=training_args.learning_rate,
+                num_train_steps=len(training_dataset) * training_args.num_train_epochs,
+                num_warmup_steps=num_warmup_steps,
+                adam_beta1=training_args.adam_beta1,
+                adam_beta2=training_args.adam_beta2,
+                adam_epsilon=training_args.adam_epsilon,
+                weight_decay_rate=training_args.weight_decay,
+                adam_global_clipnorm=training_args.max_grad_norm,
+            )
+
+            # no user-specified loss = will use the model internal loss
+            model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
+
+        else:
+            model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
+            training_dataset = None
+
+        if training_args.do_eval:
+            eval_dataset = model.prepare_tf_dataset(
+                processed_datasets["validation"],
+                shuffle=False,
+                batch_size=training_args.per_device_train_batch_size * num_replicas,
+                tokenizer=tokenizer,
+            )
+            eval_dataset = eval_dataset.with_options(dataset_options)
+        else:
+            eval_dataset = None
+
+        if training_args.do_predict:
+            predict_dataset = model.prepare_tf_dataset(
+                processed_datasets["test"],
+                shuffle=False,
+                batch_size=training_args.per_device_eval_batch_size * num_replicas,
+                tokenizer=tokenizer,
+            )
+            predict_dataset = predict_dataset.with_options(dataset_options)
+        else:
+            predict_dataset = None

-        # no user-specified loss = will use the model internal loss
-        model.compile(optimizer=optimizer)
        # endregion

-        # region Training
-        if padding:
-            data_collator = DefaultDataCollator(return_tensors="tf")
+        # region Preparing push_to_hub and model card
+        push_to_hub_model_id = training_args.push_to_hub_model_id
+        model_name = model_args.model_name_or_path.split("/")[-1]
+        if not push_to_hub_model_id:
+            if data_args.dataset_name is not None:
+                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+            else:
+                push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
+
+        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
+        if data_args.dataset_name is not None:
+            model_card_kwargs["dataset_tags"] = data_args.dataset_name
+            if data_args.dataset_config_name is not None:
+                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            else:
+                model_card_kwargs["dataset"] = data_args.dataset_name
+
+        if training_args.push_to_hub:
+            callbacks = [
+                PushToHubCallback(
+                    output_dir=training_args.output_dir,
+                    model_id=push_to_hub_model_id,
+                    organization=training_args.push_to_hub_organization,
+                    token=training_args.push_to_hub_token,
+                    tokenizer=tokenizer,
+                    **model_card_kwargs,
+                )
+            ]
        else:
-            data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
-        tensor_keys = ["attention_mask", "input_ids"]
-        label_keys = ["start_positions", "end_positions"]
+            callbacks = []
+        # endregion
+
+        # region Training and Evaluation

        if training_args.do_train:
-            # Make a tf.data.Dataset for this
-            training_dataset = processed_datasets["train"].to_tf_dataset(
-                # labels are passed as input, as we will use the model's internal loss
-                columns=tensor_keys + label_keys,
-                shuffle=True,
-                batch_size=training_args.per_device_train_batch_size,
-                collate_fn=data_collator,
-                drop_remainder=True,
-            )
-            model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
-        # endregion
+            # Note that the validation and test datasets have been processed in a different way to the
+            # training datasets in this example, and so they don't have the same label structure.
+            # As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
+            # after training.
+            model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)

-        # region Evaluation
        if training_args.do_eval:
            logger.info("*** Evaluation ***")
-            eval_inputs = {
-                "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
-                "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
-            }
-            eval_predictions = model.predict(eval_inputs)
+
+            # In this example, we compute advanced metrics at the end of training, but
+            # if you'd like to compute metrics every epoch that are too complex to be written as
+            # standard Keras metrics, you can use our KerasMetricCallback. See
+            # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
+
+            eval_predictions = model.predict(eval_dataset)
+            if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
+                # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
+                # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
+                # the highest probability in a sample. Instead, we use a large negative value, which ensures that the
+                # padding positions are correctly masked.
+                eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
+                eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
+            else:
+                eval_start_logits = eval_predictions.start_logits
+                eval_end_logits = eval_predictions.end_logits

            post_processed_eval = post_processing_function(
                datasets["validation"],
                processed_datasets["validation"],
-                (eval_predictions.start_logits, eval_predictions.end_logits),
+                (eval_start_logits, eval_end_logits),
            )
            metrics = compute_metrics(post_processed_eval)
            logging.info("Evaluation metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
+            if training_args.output_dir is not None:
+                output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+                with open(output_eval_file, "w") as writer:
+                    writer.write(json.dumps(metrics))
        # endregion

        # region Prediction
        if training_args.do_predict:
            logger.info("*** Predict ***")
-            predict_inputs = {
-                "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
-                "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
-            }
-            test_predictions = model.predict(predict_inputs)
+
+            test_predictions = model.predict(predict_dataset)
+            if isinstance(test_predictions.start_logits, tf.RaggedTensor):
+                # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
+                # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
+                # the highest probability in a sample. Instead, we use a large negative value, which ensures that the
+                # padding positions are correctly masked.
+                test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
+                test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
+            else:
+                test_start_logits = test_predictions.start_logits
+                test_end_logits = test_predictions.end_logits
            post_processed_test = post_processing_function(
                datasets["test"],
                processed_datasets["test"],
-                (test_predictions.start_logits, test_predictions.end_logits),
+                (test_start_logits, test_end_logits),
            )
            metrics = compute_metrics(post_processed_test)

@@ -694,8 +792,9 @@ def main():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

-    if training_args.push_to_hub:
-        model.push_to_hub()
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
+        model.save_pretrained(training_args.output_dir)


 if __name__ == "__main__":