TF text classification examples (#15704)

* Working example with to_tf_dataset

* updated text_classification

* more comments
This commit is contained in:
Joao Gante
2022-02-21 17:17:59 +00:00
committed by GitHub
parent 142b69f24b
commit 3956b133b6
5 changed files with 32 additions and 113 deletions

View File

@@ -65,12 +65,7 @@ Those processors are:
Additionally, the following method can be used to load values from a data file and convert them to a list of Additionally, the following method can be used to load values from a data file and convert them to a list of
[`~data.processors.utils.InputExample`]. [`~data.processors.utils.InputExample`].
automethod,transformers.data.processors.glue.glue_convert_examples_to_features [[autodoc]] data.processors.glue.glue_convert_examples_to_features
### Example usage
An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
## XNLI ## XNLI
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcesso
Additionally, the following method can be used to convert SQuAD examples into Additionally, the following method can be used to convert SQuAD examples into
[`~data.processors.utils.SquadFeatures`] that can be used as model inputs. [`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
automethod,transformers.data.processors.squad.squad_convert_examples_to_features [[autodoc]] data.processors.squad.squad_convert_examples_to_features
These processors as well as the aforementioned method can be used with files containing the data as well as with the These processors as well as the aforementioned method can be used with files containing the data as well as with the

View File

@@ -457,7 +457,8 @@ def main():
else: else:
return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
# we already did the padding.
if data_args.pad_to_max_length: if data_args.pad_to_max_length:
data_collator = default_data_collator data_collator = default_data_collator
elif training_args.fp16: elif training_args.fp16:

View File

@@ -30,6 +30,8 @@ import transformers
from transformers import ( from transformers import (
AutoConfig, AutoConfig,
AutoTokenizer, AutoTokenizer,
DataCollatorWithPadding,
DefaultDataCollator,
HfArgumentParser, HfArgumentParser,
PretrainedConfig, PretrainedConfig,
TFAutoModelForSequenceClassification, TFAutoModelForSequenceClassification,
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version
# region Helper functions # region Helper functions
def convert_dataset_for_tensorflow(
dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
):
"""Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
is most useful when training on TPU, as a new graph compilation is required for each sequence length.
"""
def densify_ragged_batch(features, label=None):
features = {
feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
}
if label is None:
return features
else:
return features, label
feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
if dataset_mode == "variable_batch":
batch_shape = {key: None for key in feature_keys}
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
elif dataset_mode == "constant_batch":
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
batch_shape = {
key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
for key, ragged_tensor in data.items()
}
else:
raise ValueError("Unknown dataset mode!")
if "label" in dataset.features:
labels = tf.convert_to_tensor(np.array(dataset["label"]))
tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
else:
tf_dataset = tf.data.Dataset.from_tensor_slices(data)
if shuffle:
tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
return tf_dataset
class SavePretrainedCallback(tf.keras.callbacks.Callback): class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
@@ -377,6 +338,10 @@ def main():
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
if data_args.pad_to_max_length:
data_collator = DefaultDataCollator(return_tensors="tf")
else:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
# endregion # endregion
# region Metric function # region Metric function
@@ -426,11 +391,6 @@ def main():
# region Convert data to a tf.data.Dataset # region Convert data to a tf.data.Dataset
tf_data = dict() tf_data = dict()
if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
logger.info("Padding all batches to max length because argument was set or we're on TPU.")
dataset_mode = "constant_batch"
else:
dataset_mode = "variable_batch"
max_samples = { max_samples = {
"train": data_args.max_train_samples, "train": data_args.max_train_samples,
"validation": data_args.max_eval_samples, "validation": data_args.max_eval_samples,
@@ -456,13 +416,14 @@ def main():
dataset = datasets[key] dataset = datasets[key]
if samples_limit is not None: if samples_limit is not None:
dataset = dataset.select(range(samples_limit)) dataset = dataset.select(range(samples_limit))
data = convert_dataset_for_tensorflow( data = dataset.to_tf_dataset(
dataset, columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
non_label_column_names,
batch_size=batch_size,
dataset_mode=dataset_mode,
drop_remainder=drop_remainder,
shuffle=shuffle, shuffle=shuffle,
batch_size=batch_size,
collate_fn=data_collator,
drop_remainder=drop_remainder,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in dataset.column_names else None,
) )
tf_data[key] = data tf_data[key] = data
# endregion # endregion

View File

@@ -29,6 +29,8 @@ from datasets import load_dataset
from transformers import ( from transformers import (
AutoConfig, AutoConfig,
AutoTokenizer, AutoTokenizer,
DataCollatorWithPadding,
DefaultDataCollator,
HfArgumentParser, HfArgumentParser,
PretrainedConfig, PretrainedConfig,
TFAutoModelForSequenceClassification, TFAutoModelForSequenceClassification,
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
self.model.save_pretrained(self.output_dir) self.model.save_pretrained(self.output_dir)
def convert_dataset_for_tensorflow(
dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
):
"""Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
is most useful when training on TPU, as a new graph compilation is required for each sequence length.
"""
def densify_ragged_batch(features, label=None):
features = {
feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
}
if label is None:
return features
else:
return features, label
feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
if dataset_mode == "variable_batch":
batch_shape = {key: None for key in feature_keys}
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
elif dataset_mode == "constant_batch":
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
batch_shape = {
key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
for key, ragged_tensor in data.items()
}
else:
raise ValueError("Unknown dataset mode!")
if "label" in dataset.features:
labels = tf.convert_to_tensor(np.array(dataset["label"]))
tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
else:
tf_dataset = tf.data.Dataset.from_tensor_slices(data)
if shuffle:
tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
return tf_dataset
# endregion # endregion
@@ -399,6 +360,11 @@ def main():
return result return result
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
if data_args.pad_to_max_length:
data_collator = DefaultDataCollator(return_tensors="tf")
else:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
# endregion # endregion
with training_args.strategy.scope(): with training_args.strategy.scope():
@@ -464,18 +430,14 @@ def main():
dataset = datasets[key] dataset = datasets[key]
if samples_limit is not None: if samples_limit is not None:
dataset = dataset.select(range(samples_limit)) dataset = dataset.select(range(samples_limit))
if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length: data = dataset.to_tf_dataset(
logger.info("Padding all batches to max length because argument was set or we're on TPU.") columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
dataset_mode = "constant_batch"
else:
dataset_mode = "variable_batch"
data = convert_dataset_for_tensorflow(
dataset,
non_label_column_names,
batch_size=batch_size,
dataset_mode=dataset_mode,
drop_remainder=drop_remainder,
shuffle=shuffle, shuffle=shuffle,
batch_size=batch_size,
collate_fn=data_collator,
drop_remainder=drop_remainder,
# `label_cols` is needed for user-defined losses, such as in this example
label_cols="label" if "label" in dataset.column_names else None,
) )
tf_data[key] = data tf_data[key] = data
# endregion # endregion

View File

@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
def train_step(self, data): def train_step(self, data):
""" """
A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss. A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
""" """
# These are the only transformations `Model.fit` applies to user-input # These are the only transformations `Model.fit` applies to user-input
# data when a `tf.data.Dataset` is provided. # data when a `tf.data.Dataset` is provided.