TF Examples Rewrite (#18451)

* Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2022-08-10 11:49:51 -04:00
parent d7e2d7b40b
commit 6eb51450fa
15 changed files with 1490 additions and 660 deletions
--- a/examples/tensorflow/token-classification/run_ner.py
+++ b/examples/tensorflow/token-classification/run_ner.py
@@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C
 without using a Trainer.
 """

+import json
 import logging
+import os
 import random
 from dataclasses import dataclass, field
-from functools import partial
 from typing import Optional

 import datasets
-import numpy as np
 import tensorflow as tf
 from datasets import ClassLabel, load_dataset

@@ -33,10 +33,11 @@ import evaluate
 import transformers
 from transformers import (
    CONFIG_MAPPING,
-    MODEL_MAPPING,
    AutoConfig,
    AutoTokenizer,
+    DataCollatorForTokenClassification,
    HfArgumentParser,
+    PushToHubCallback,
    TFAutoModelForTokenClassification,
    TFTrainingArguments,
    create_optimizer,
@@ -48,11 +49,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 logger.addHandler(logging.StreamHandler())
-require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
-
-# You should update this to your particular problem to have better documentation of `model_type`
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt")


 # region Command-line arguments
@@ -195,61 +192,6 @@ class DataTrainingArguments:
 # endregion


-# region Data generator
-def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None):
-    # Trim off the last partial batch if present
-    if shuffle:
-        sample_ordering = np.random.permutation(len(dataset))
-    else:
-        sample_ordering = np.arange(len(dataset))
-    for sample_idx in sample_ordering:
-        example = dataset[int(sample_idx)]
-        # Handle dicts with proper padding and conversion to tensor.
-        example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
-        if tokenizer.pad_token_id is not None:
-            example["labels"][example["attention_mask"] == 0] = -100
-        example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}
-
-        yield example, example["labels"]  # TF needs some kind of labels, even if we don't use them
-    return
-
-
-# endregion
-
-
-# region Helper functions
-def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
-    train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle)
-    train_signature = {
-        feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
-        for feature in dataset.features
-        if feature != "special_tokens_mask"
-    }
-    # This may need to be changed depending on your particular model or tokenizer!
-    padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in dataset.features}
-    padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64)
-    if tokenizer.pad_token_id is not None:
-        padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64)
-    train_signature["labels"] = train_signature["input_ids"]
-    train_signature = (train_signature, train_signature["labels"])
-    options = tf.data.Options()
-    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
-    tf_dataset = (
-        tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
-        .with_options(options)
-        .padded_batch(
-            batch_size=total_batch_size,
-            drop_remainder=True,
-            padding_values=(padding_values, np.array(0, dtype=np.int64)),
-        )
-        .repeat(int(num_epochs))
-    )
-    return tf_dataset
-
-
-# endregion
-
-
 def main():
    # region Argument Parsing
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
@@ -419,6 +361,14 @@ def main():
    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

+    if data_args.max_train_samples is not None:
+        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
+        train_dataset = train_dataset.select(range(max_train_samples))
+
+    if data_args.max_eval_samples is not None:
+        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
+        eval_dataset = eval_dataset.select(range(max_eval_samples))
+
    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
@@ -439,43 +389,62 @@ def main():
        # endregion

        # region Create TF datasets
+
+        # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
+        # well as inputs.
+        collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
-        train_batches_per_epoch = len(train_dataset) // total_train_batch_size
-        tf_train_dataset = dataset_to_tf(
+
+        dataset_options = tf.data.Options()
+        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+
+        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
+        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
+        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
+        # yourself if you use this method, whereas they are automatically inferred from the model input names when
+        # using model.prepare_tf_dataset()
+        # For more info see the docs:
+        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
+        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
+
+        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
-            tokenizer,
-            total_batch_size=total_train_batch_size,
-            num_epochs=training_args.num_train_epochs,
+            collate_fn=collate_fn,
+            batch_size=total_train_batch_size,
            shuffle=True,
-        )
+        ).with_options(dataset_options)
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
-        eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
-        tf_eval_dataset = dataset_to_tf(
+        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
-            tokenizer,
-            total_batch_size=total_eval_batch_size,
-            num_epochs=training_args.num_train_epochs,
+            collate_fn=collate_fn,
+            batch_size=total_eval_batch_size,
            shuffle=False,
-        )
+        ).with_options(dataset_options)

        # endregion

        # region Optimizer, loss and compilation
+        num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
+        if training_args.warmup_steps > 0:
+            num_warmup_steps = training_args.warmup_steps
+        elif training_args.warmup_ratio > 0:
+            num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
+        else:
+            num_warmup_steps = 0
+
        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
-            num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch),
-            num_warmup_steps=training_args.warmup_steps,
+            num_train_steps=num_train_steps,
+            num_warmup_steps=num_warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
+            adam_global_clipnorm=training_args.max_grad_norm,
        )

-        def dummy_loss(y_true, y_pred):
-            return tf.reduce_mean(y_pred)
-
-        model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
+        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
        # endregion

        # Metrics
@@ -517,6 +486,39 @@ def main():

        # endregion

+        # region Preparing push_to_hub and model card
+        push_to_hub_model_id = training_args.push_to_hub_model_id
+        model_name = model_args.model_name_or_path.split("/")[-1]
+        if not push_to_hub_model_id:
+            if data_args.dataset_name is not None:
+                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
+            else:
+                push_to_hub_model_id = f"{model_name}-finetuned-token-classification"
+
+        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
+        if data_args.dataset_name is not None:
+            model_card_kwargs["dataset_tags"] = data_args.dataset_name
+            if data_args.dataset_config_name is not None:
+                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
+                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            else:
+                model_card_kwargs["dataset"] = data_args.dataset_name
+
+        if training_args.push_to_hub:
+            callbacks = [
+                PushToHubCallback(
+                    output_dir=training_args.output_dir,
+                    model_id=push_to_hub_model_id,
+                    organization=training_args.push_to_hub_organization,
+                    token=training_args.push_to_hub_token,
+                    tokenizer=tokenizer,
+                    **model_card_kwargs,
+                )
+            ]
+        else:
+            callbacks = []
+        # endregion
+
        # region Training
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
@@ -524,23 +526,43 @@ def main():
        logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
        logger.info(f"  Total train batch size = {total_train_batch_size}")
        # Only show the progress bar once on each machine.
+
        model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
-            steps_per_epoch=train_batches_per_epoch,
-            validation_steps=eval_batches_per_epoch,
+            callbacks=callbacks,
        )
        # endregion

        # region Predictions
-        # For predictions, we preload the entire validation set - note that if you have a really giant validation
-        # set, you might need to change this!
-        eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features}
-        predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"]
-        predictions = tf.math.argmax(predictions, axis=-1)
-        labels = np.array(eval_inputs["labels"])
-        labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
+        # If you have variable batch sizes (i.e. not using pad_to_max_length), then
+        # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
+        # length from predict().
+
+        try:
+            predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"]
+        except tf.python.framework.errors_impl.InvalidArgumentError:
+            raise ValueError(
+                "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
+                "then you will need to use --pad_to_max_length to generate predictions, as older "
+                "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
+            )
+        if isinstance(predictions, tf.RaggedTensor):
+            predictions = predictions.to_tensor(default_value=-100)
+        predictions = tf.math.argmax(predictions, axis=-1).numpy()
+        if "label" in eval_dataset:
+            labels = eval_dataset.with_format("tf")["label"]
+        else:
+            labels = eval_dataset.with_format("tf")["labels"]
+        if isinstance(labels, tf.RaggedTensor):
+            labels = labels.to_tensor(default_value=-100)
+        labels = labels.numpy()
+        attention_mask = eval_dataset.with_format("tf")["attention_mask"]
+        if isinstance(attention_mask, tf.RaggedTensor):
+            attention_mask = attention_mask.to_tensor(default_value=-100)
+        attention_mask = attention_mask.numpy()
+        labels[attention_mask == 0] = -100
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
@@ -550,12 +572,15 @@ def main():
        logger.info("Evaluation metrics:")
        for key, val in eval_metric.items():
            logger.info(f"{key}: {val:.4f}")
+
+        if training_args.output_dir is not None:
+            output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
+            with open(output_eval_file, "w") as writer:
+                writer.write(json.dumps(eval_metric))
        # endregion

-    # We don't do predictions in the strategy scope because there are some issues in there right now.
-    # They'll get fixed eventually, promise!
-
-    if training_args.output_dir is not None:
+    if training_args.output_dir is not None and not training_args.push_to_hub:
+        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)