Update TF multiple choice example (#15868)

2022-03-08 13:16:34 +00:00
parent ab2f8d12a7
commit 62d847602a
1 changed files with 88 additions and 49 deletions
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -24,10 +24,9 @@ import sys
 from dataclasses import dataclass, field
 from itertools import chain
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 import datasets
 import numpy as np
 import tensorflow as tf
 from datasets import load_dataset
@@ -37,12 +36,15 @@ from transformers import (
    TF2_WEIGHTS_NAME,
    AutoConfig,
    AutoTokenizer,
    DefaultDataCollator,
    HfArgumentParser,
    TFAutoModelForMultipleChoice,
    TFTrainingArguments,
    create_optimizer,
    set_seed,
 )
 from transformers.file_utils import PaddingStrategy
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.utils import check_min_version
@@ -65,51 +67,61 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
        self.model.save_pretrained(self.output_dir)
-def convert_dataset_for_tensorflow(
+@dataclass
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
+class DataCollatorForMultipleChoice:
-):
+    """
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
+    Data collator that will dynamically pad the inputs for multiple choice received.
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
+
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
+    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """
-    def densify_ragged_batch(features, label=None):
+    tokenizer: PreTrainedTokenizerBase
-        features = {
+    padding: Union[bool, str, PaddingStrategy] = True
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
+    max_length: Optional[int] = None
-        }
+    pad_to_multiple_of: Optional[int] = None
        if label is None:
            return features
        else:
            return features, label
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
+    def __call__(self, features):
-    if dataset_mode == "variable_batch":
+        label_name = "label" if "label" in features[0].keys() else "labels"
-        batch_shape = {key: None for key in feature_keys}
+        labels = [feature.pop(label_name) for feature in features]
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
+        batch_size = len(features)
-    elif dataset_mode == "constant_batch":
+        num_choices = len(features[0]["input_ids"])
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
+        flattened_features = [
-        batch_shape = {
+            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
+        ]
-            for key, ragged_tensor in data.items()
+        flattened_features = list(chain(*flattened_features))
        }
    else:
        raise ValueError("Unknown dataset mode!")
-    if "label" in dataset.features:
+        batch = self.tokenizer.pad(
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
+            flattened_features,
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+            padding=self.padding,
-    else:
+            max_length=self.max_length,
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
+            pad_to_multiple_of=self.pad_to_multiple_of,
-    if shuffle:
+            return_tensors="tf",
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
+        )
-    options = tf.data.Options()
+
-    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+        # Un-flatten
-    tf_dataset = (
+        batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
-        tf_dataset.with_options(options)
+        # Add back labels
-        .batch(batch_size=batch_size, drop_remainder=drop_remainder)
+        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
-        .map(densify_ragged_batch)
+        return batch
    )
    return tf_dataset
 # endregion
@@ -382,6 +394,12 @@ def main():
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )
    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        # custom class defined above, as HF has no data collator for multiple choice
        data_collator = DataCollatorForMultipleChoice(tokenizer)
    # endregion
    with training_args.strategy.scope():
@@ -417,12 +435,26 @@ def main():
        # region Training
        if training_args.do_train:
-            tf_train_dataset = convert_dataset_for_tensorflow(
+            dataset_exclude_cols = set(non_label_columns + ["label"])
-                train_dataset, non_label_column_names=non_label_columns, batch_size=total_train_batch_size
+            tf_train_dataset = train_dataset.to_tf_dataset(
                columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols],
                shuffle=True,
                batch_size=total_train_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in train_dataset.column_names else None,
            )
            if training_args.do_eval:
-                validation_data = convert_dataset_for_tensorflow(
+                validation_data = eval_dataset.to_tf_dataset(
-                    eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
+                    columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
                    shuffle=False,
                    batch_size=total_eval_batch_size,
                    collate_fn=data_collator,
                    drop_remainder=True,
                    # `label_cols` is needed for user-defined losses, such as in this example
                    label_cols="label" if "label" in eval_dataset.column_names else None,
                )
            else:
                validation_data = None
@@ -436,9 +468,16 @@ def main():
        # region Evaluation
        if training_args.do_eval and not training_args.do_train:
            dataset_exclude_cols = set(non_label_columns + ["label"])
            # Do a standalone evaluation pass
-            tf_eval_dataset = convert_dataset_for_tensorflow(
+            tf_eval_dataset = eval_dataset.to_tf_dataset(
-                eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size
+                columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
                shuffle=False,
                batch_size=total_eval_batch_size,
                collate_fn=data_collator,
                drop_remainder=True,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in eval_dataset.column_names else None,
            )
            model.evaluate(tf_eval_dataset)
        # endregion