From 3956b133b6086cb597afc252f51c684f63be2d5f Mon Sep 17 00:00:00 2001
From: Joao Gante <joao@huggingface.co>
Date: Mon, 21 Feb 2022 17:17:59 +0000
Subject: [PATCH] TF text classification examples (#15704)

* Working example with `to_tf_dataset` in the TensorFlow run_glue.py script

* Updated run_text_classification.py to use `to_tf_dataset` as well

* Added more comments
---
 docs/source/main_classes/processors.mdx       |  9 +--
 .../pytorch/text-classification/run_glue.py   |  3 +-
 .../text-classification/run_glue.py           | 65 ++++--------------
 .../run_text_classification.py                | 66 ++++---------------
 src/transformers/modeling_tf_utils.py         |  2 +-
 5 files changed, 32 insertions(+), 113 deletions(-)

diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx
index 2aaca485df..e0a5c9422f 100644
--- a/docs/source/main_classes/processors.mdx
+++ b/docs/source/main_classes/processors.mdx
@@ -65,12 +65,7 @@ Those processors are:
 Additionally, the following method can be used to load values from a data file and convert them to a list of
 [`~data.processors.utils.InputExample`].
 
-automethod,transformers.data.processors.glue.glue_convert_examples_to_features
-
-
-### Example usage
-
-An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features
 
 
 ## XNLI
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcesso
 Additionally, the following method can be used to convert SQuAD examples into
 [`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
 
-automethod,transformers.data.processors.squad.squad_convert_examples_to_features
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features
 
 
 These processors as well as the aforementionned method can be used with files containing the data as well as with the
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 5fab1eecb7..5178e1c70c 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -457,7 +457,8 @@ def main():
         else:
             return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
 
-    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
+    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
+    # we already did the padding.
     if data_args.pad_to_max_length:
         data_collator = default_data_collator
     elif training_args.fp16:
diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py
index c517b66627..39a92bc2ef 100644
--- a/examples/tensorflow/text-classification/run_glue.py
+++ b/examples/tensorflow/text-classification/run_glue.py
@@ -30,6 +30,8 @@ import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version
 # region Helper functions
 
 
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
-
-
 class SavePretrainedCallback(tf.keras.callbacks.Callback):
     # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
     # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
@@ -377,6 +338,10 @@ def main():
 
     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
 
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion
 
     # region Metric function
@@ -426,11 +391,6 @@ def main():
 
         # region Convert data to a tf.data.Dataset
         tf_data = dict()
-        if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-            logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-            dataset_mode = "constant_batch"
-        else:
-            dataset_mode = "variable_batch"
         max_samples = {
             "train": data_args.max_train_samples,
             "validation": data_args.max_eval_samples,
@@ -456,13 +416,14 @@ def main():
             dataset = datasets[key]
             if samples_limit is not None:
                 dataset = dataset.select(range(samples_limit))
-            data = convert_dataset_for_tensorflow(
-                dataset,
-                non_label_column_names,
-                batch_size=batch_size,
-                dataset_mode=dataset_mode,
-                drop_remainder=drop_remainder,
+            data = dataset.to_tf_dataset(
+                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
                 shuffle=shuffle,
+                batch_size=batch_size,
+                collate_fn=data_collator,
+                drop_remainder=drop_remainder,
+                # `label_cols` is needed for user-defined losses, such as in this example
+                label_cols="label" if "label" in dataset.column_names else None,
             )
             tf_data[key] = data
         # endregion
diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py
index a52d79ef1d..f2693bb1b8 100644
--- a/examples/tensorflow/text-classification/run_text_classification.py
+++ b/examples/tensorflow/text-classification/run_text_classification.py
@@ -29,6 +29,8 @@ from datasets import load_dataset
 from transformers import (
     AutoConfig,
     AutoTokenizer,
+    DataCollatorWithPadding,
+    DefaultDataCollator,
     HfArgumentParser,
     PretrainedConfig,
     TFAutoModelForSequenceClassification,
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
         self.model.save_pretrained(self.output_dir)
 
 
-def convert_dataset_for_tensorflow(
-    dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
-):
-    """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
-    to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
-    is most useful when training on TPU, as a new graph compilation is required for each sequence length.
-    """
-
-    def densify_ragged_batch(features, label=None):
-        features = {
-            feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
-        }
-        if label is None:
-            return features
-        else:
-            return features, label
-
-    feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
-    if dataset_mode == "variable_batch":
-        batch_shape = {key: None for key in feature_keys}
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-    elif dataset_mode == "constant_batch":
-        data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
-        batch_shape = {
-            key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
-            for key, ragged_tensor in data.items()
-        }
-    else:
-        raise ValueError("Unknown dataset mode!")
-
-    if "label" in dataset.features:
-        labels = tf.convert_to_tensor(np.array(dataset["label"]))
-        tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
-    else:
-        tf_dataset = tf.data.Dataset.from_tensor_slices(data)
-    if shuffle:
-        tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
-    tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
-    return tf_dataset
-
-
 # endregion
 
 
@@ -399,6 +360,11 @@ def main():
         return result
 
     datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
+
+    if data_args.pad_to_max_length:
+        data_collator = DefaultDataCollator(return_tensors="tf")
+    else:
+        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
     # endregion
 
     with training_args.strategy.scope():
@@ -464,18 +430,14 @@ def main():
             dataset = datasets[key]
             if samples_limit is not None:
                 dataset = dataset.select(range(samples_limit))
-            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
-                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
-                dataset_mode = "constant_batch"
-            else:
-                dataset_mode = "variable_batch"
-            data = convert_dataset_for_tensorflow(
-                dataset,
-                non_label_column_names,
-                batch_size=batch_size,
-                dataset_mode=dataset_mode,
-                drop_remainder=drop_remainder,
+            data = dataset.to_tf_dataset(
+                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
                 shuffle=shuffle,
+                batch_size=batch_size,
+                collate_fn=data_collator,
+                drop_remainder=drop_remainder,
+                # `label_cols` is needed for user-defined losses, such as in this example
+                label_cols="label" if "label" in dataset.column_names else None,
             )
             tf_data[key] = data
         # endregion
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index de2c61ae4c..6a3669f7db 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
 
     def train_step(self, data):
         """
-        A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss.
+        A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
         """
         # These are the only transformations `Model.fit` applies to user-input
         # data when a `tf.data.Dataset` is provided.