From 3956b133b6086cb597afc252f51c684f63be2d5f Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 21 Feb 2022 17:17:59 +0000 Subject: [PATCH] TF text classification examples (#15704) * Working example with to_tf_dataset * updated text_classification * more comments --- docs/source/main_classes/processors.mdx | 9 +-- .../pytorch/text-classification/run_glue.py | 3 +- .../text-classification/run_glue.py | 65 ++++-------------- .../run_text_classification.py | 66 ++++--------------- src/transformers/modeling_tf_utils.py | 2 +- 5 files changed, 32 insertions(+), 113 deletions(-) diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx index 2aaca485df..e0a5c9422f 100644 --- a/docs/source/main_classes/processors.mdx +++ b/docs/source/main_classes/processors.mdx @@ -65,12 +65,7 @@ Those processors are: Additionally, the following method can be used to load values from a data file and convert them to a list of [`~data.processors.utils.InputExample`]. -automethod,transformers.data.processors.glue.glue_convert_examples_to_features - - -### Example usage - -An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script. +[[autodoc]] data.processors.glue.glue_convert_examples_to_features ## XNLI @@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcesso Additionally, the following method can be used to convert SQuAD examples into [`~data.processors.utils.SquadFeatures`] that can be used as model inputs. -automethod,transformers.data.processors.squad.squad_convert_examples_to_features +[[autodoc]] data.processors.squad.squad_convert_examples_to_features These processors as well as the aforementionned method can be used with files containing the data as well as with the diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 5fab1eecb7..5178e1c70c 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -457,7 +457,8 @@ def main(): else: return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} - # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. + # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if + # we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index c517b66627..39a92bc2ef 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -30,6 +30,8 @@ import transformers from transformers import ( AutoConfig, AutoTokenizer, + DataCollatorWithPadding, + DefaultDataCollator, HfArgumentParser, PretrainedConfig, TFAutoModelForSequenceClassification, @@ -43,47 +45,6 @@ from transformers.utils import check_min_version # region Helper functions -def convert_dataset_for_tensorflow( - dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True -): - """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches - to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former - is most useful when training on TPU, as a new graph compilation is required for each sequence length. - """ - - def densify_ragged_batch(features, label=None): - features = { - feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items() - } - if label is None: - return features - else: - return features, label - - feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"])) - if dataset_mode == "variable_batch": - batch_shape = {key: None for key in feature_keys} - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - elif dataset_mode == "constant_batch": - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - batch_shape = { - key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0) - for key, ragged_tensor in data.items() - } - else: - raise ValueError("Unknown dataset mode!") - - if "label" in dataset.features: - labels = tf.convert_to_tensor(np.array(dataset["label"])) - tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - else: - tf_dataset = tf.data.Dataset.from_tensor_slices(data) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset)) - tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch) - return tf_dataset - - class SavePretrainedCallback(tf.keras.callbacks.Callback): # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback @@ -377,6 +338,10 @@ def main(): datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) + if data_args.pad_to_max_length: + data_collator = DefaultDataCollator(return_tensors="tf") + else: + data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion # region Metric function @@ -426,11 +391,6 @@ def main(): # region Convert data to a tf.data.Dataset tf_data = dict() - if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length: - logger.info("Padding all batches to max length because argument was set or we're on TPU.") - dataset_mode = "constant_batch" - else: - dataset_mode = "variable_batch" max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_eval_samples, @@ -456,13 +416,14 @@ def main(): dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - data = convert_dataset_for_tensorflow( - dataset, - non_label_column_names, - batch_size=batch_size, - dataset_mode=dataset_mode, - drop_remainder=drop_remainder, + data = dataset.to_tf_dataset( + columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], shuffle=shuffle, + batch_size=batch_size, + collate_fn=data_collator, + drop_remainder=drop_remainder, + # `label_cols` is needed for user-defined losses, such as in this example + label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index a52d79ef1d..f2693bb1b8 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -29,6 +29,8 @@ from datasets import load_dataset from transformers import ( AutoConfig, AutoTokenizer, + DataCollatorWithPadding, + DefaultDataCollator, HfArgumentParser, PretrainedConfig, TFAutoModelForSequenceClassification, @@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback): self.model.save_pretrained(self.output_dir) -def convert_dataset_for_tensorflow( - dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True -): - """Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches - to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former - is most useful when training on TPU, as a new graph compilation is required for each sequence length. - """ - - def densify_ragged_batch(features, label=None): - features = { - feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items() - } - if label is None: - return features - else: - return features, label - - feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"])) - if dataset_mode == "variable_batch": - batch_shape = {key: None for key in feature_keys} - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - elif dataset_mode == "constant_batch": - data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys} - batch_shape = { - key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0) - for key, ragged_tensor in data.items() - } - else: - raise ValueError("Unknown dataset mode!") - - if "label" in dataset.features: - labels = tf.convert_to_tensor(np.array(dataset["label"])) - tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels)) - else: - tf_dataset = tf.data.Dataset.from_tensor_slices(data) - if shuffle: - tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset)) - tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch) - return tf_dataset - - # endregion @@ -399,6 +360,11 @@ def main(): return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) + + if data_args.pad_to_max_length: + data_collator = DefaultDataCollator(return_tensors="tf") + else: + data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion with training_args.strategy.scope(): @@ -464,18 +430,14 @@ def main(): dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length: - logger.info("Padding all batches to max length because argument was set or we're on TPU.") - dataset_mode = "constant_batch" - else: - dataset_mode = "variable_batch" - data = convert_dataset_for_tensorflow( - dataset, - non_label_column_names, - batch_size=batch_size, - dataset_mode=dataset_mode, - drop_remainder=drop_remainder, + data = dataset.to_tf_dataset( + columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], shuffle=shuffle, + batch_size=batch_size, + collate_fn=data_collator, + drop_remainder=drop_remainder, + # `label_cols` is needed for user-defined losses, such as in this example + label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index de2c61ae4c..6a3669f7db 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu def train_step(self, data): """ - A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss. + A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss. """ # These are the only transformations `Model.fit` applies to user-input # data when a `tf.data.Dataset` is provided.