From 62d847602ab90cbef899a5ef7556a5c59311b9a8 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 8 Mar 2022 13:16:34 +0000 Subject: [PATCH] Update TF multiple choice example (#15868) --- .../tensorflow/multiple-choice/run_swag.py | 137 +++++++++++------- 1 file changed, 88 insertions(+), 49 deletions(-) diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 499ed5638d..536626a0f9 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -24,10 +24,9 @@ import sys from dataclasses import dataclass, field from itertools import chain from pathlib import Path -from typing import Optional +from typing import Optional, Union import datasets -import numpy as np import tensorflow as tf from datasets import load_dataset @@ -37,12 +36,15 @@ from transformers import ( TF2_WEIGHTS_NAME, AutoConfig, AutoTokenizer, + DefaultDataCollator, HfArgumentParser, TFAutoModelForMultipleChoice, TFTrainingArguments, create_optimizer, set_seed, ) +from transformers.file_utils import PaddingStrategy +from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.utils import check_min_version @@ -65,51 +67,61 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback): self.model.save_pretrained(self.output_dir) -def convert_dataset_for_tensorflow( - dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True -): - """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches - to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former - is most useful when training on TPU, as a new graph compilation is required for each sequence length. +@dataclass +class DataCollatorForMultipleChoice: + """ + Data collator that will dynamically pad the inputs for multiple choice received. + + Args: + tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + The tokenizer used for encoding the data. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). """ - def densify_ragged_batch(features, label=None): - features = { - feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items() - } - if label is None: - return features - else: - return features, label + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None - feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"])) - if dataset_mode == "variable_batch": - batch_shape = {key: None for key in feature_keys} - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - elif dataset_mode == "constant_batch": - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - batch_shape = { - key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0) - for key, ragged_tensor in data.items() - } - else: - raise ValueError("Unknown dataset mode!") + def __call__(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature.pop(label_name) for feature in features] + batch_size = len(features) + num_choices = len(features[0]["input_ids"]) + flattened_features = [ + [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features + ] + flattened_features = list(chain(*flattened_features)) - if "label" in dataset.features: - labels = tf.convert_to_tensor(np.array(dataset["label"])) - tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - else: - tf_dataset = tf.data.Dataset.from_tensor_slices(data) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset)) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf_dataset.with_options(options) - .batch(batch_size=batch_size, drop_remainder=drop_remainder) - .map(densify_ragged_batch) - ) - return tf_dataset + batch = self.tokenizer.pad( + flattened_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="tf", + ) + + # Un-flatten + batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} + # Add back labels + batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) + return batch # endregion @@ -382,6 +394,12 @@ def main(): num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) + + if data_args.pad_to_max_length: + data_collator = DefaultDataCollator(return_tensors="tf") + else: + # custom class defined above, as HF has no data collator for multiple choice + data_collator = DataCollatorForMultipleChoice(tokenizer) # endregion with training_args.strategy.scope(): @@ -417,12 +435,26 @@ def main(): # region Training if training_args.do_train: - tf_train_dataset = convert_dataset_for_tensorflow( - train_dataset, non_label_column_names=non_label_columns, batch_size=total_train_batch_size + dataset_exclude_cols = set(non_label_columns + ["label"]) + tf_train_dataset = train_dataset.to_tf_dataset( + columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols], + shuffle=True, + batch_size=total_train_batch_size, + collate_fn=data_collator, + drop_remainder=True, + # `label_cols` is needed for user-defined losses, such as in this example + label_cols="label" if "label" in train_dataset.column_names else None, ) + if training_args.do_eval: - validation_data = convert_dataset_for_tensorflow( - eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size + validation_data = eval_dataset.to_tf_dataset( + columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + shuffle=False, + batch_size=total_eval_batch_size, + collate_fn=data_collator, + drop_remainder=True, + # `label_cols` is needed for user-defined losses, such as in this example + label_cols="label" if "label" in eval_dataset.column_names else None, ) else: validation_data = None @@ -436,9 +468,16 @@ def main(): # region Evaluation if training_args.do_eval and not training_args.do_train: + dataset_exclude_cols = set(non_label_columns + ["label"]) # Do a standalone evaluation pass - tf_eval_dataset = convert_dataset_for_tensorflow( - eval_dataset, non_label_column_names=non_label_columns, batch_size=total_eval_batch_size + tf_eval_dataset = eval_dataset.to_tf_dataset( + columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + shuffle=False, + batch_size=total_eval_batch_size, + collate_fn=data_collator, + drop_remainder=True, + # `label_cols` is needed for user-defined losses, such as in this example + label_cols="label" if "label" in eval_dataset.column_names else None, ) model.evaluate(tf_eval_dataset) # endregion