TF text classification examples (#15704)
* Working example with to_tf_dataset * updated text_classification * more comments
This commit is contained in:
@@ -65,12 +65,7 @@ Those processors are:
|
||||
Additionally, the following method can be used to load values from a data file and convert them to a list of
|
||||
[`~data.processors.utils.InputExample`].
|
||||
|
||||
automethod,transformers.data.processors.glue.glue_convert_examples_to_features
|
||||
|
||||
|
||||
### Example usage
|
||||
|
||||
An example using these processors is given in the [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/legacy/text-classification/run_glue.py) script.
|
||||
[[autodoc]] data.processors.glue.glue_convert_examples_to_features
|
||||
|
||||
|
||||
## XNLI
|
||||
@@ -114,7 +109,7 @@ They both inherit from the abstract class [`~data.processors.utils.SquadProcesso
|
||||
Additionally, the following method can be used to convert SQuAD examples into
|
||||
[`~data.processors.utils.SquadFeatures`] that can be used as model inputs.
|
||||
|
||||
automethod,transformers.data.processors.squad.squad_convert_examples_to_features
|
||||
[[autodoc]] data.processors.squad.squad_convert_examples_to_features
|
||||
|
||||
|
||||
These processors as well as the aforementionned method can be used with files containing the data as well as with the
|
||||
|
||||
@@ -457,7 +457,8 @@ def main():
|
||||
else:
|
||||
return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
|
||||
|
||||
# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
|
||||
# Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
|
||||
# we already did the padding.
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = default_data_collator
|
||||
elif training_args.fp16:
|
||||
|
||||
@@ -30,6 +30,8 @@ import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PretrainedConfig,
|
||||
TFAutoModelForSequenceClassification,
|
||||
@@ -43,47 +45,6 @@ from transformers.utils import check_min_version
|
||||
# region Helper functions
|
||||
|
||||
|
||||
def convert_dataset_for_tensorflow(
|
||||
dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
|
||||
):
|
||||
"""Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
|
||||
to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
|
||||
is most useful when training on TPU, as a new graph compilation is required for each sequence length.
|
||||
"""
|
||||
|
||||
def densify_ragged_batch(features, label=None):
|
||||
features = {
|
||||
feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
|
||||
}
|
||||
if label is None:
|
||||
return features
|
||||
else:
|
||||
return features, label
|
||||
|
||||
feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
|
||||
if dataset_mode == "variable_batch":
|
||||
batch_shape = {key: None for key in feature_keys}
|
||||
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
|
||||
elif dataset_mode == "constant_batch":
|
||||
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
|
||||
batch_shape = {
|
||||
key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
|
||||
for key, ragged_tensor in data.items()
|
||||
}
|
||||
else:
|
||||
raise ValueError("Unknown dataset mode!")
|
||||
|
||||
if "label" in dataset.features:
|
||||
labels = tf.convert_to_tensor(np.array(dataset["label"]))
|
||||
tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
|
||||
else:
|
||||
tf_dataset = tf.data.Dataset.from_tensor_slices(data)
|
||||
if shuffle:
|
||||
tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
|
||||
tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
|
||||
return tf_dataset
|
||||
|
||||
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
||||
@@ -377,6 +338,10 @@ def main():
|
||||
|
||||
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
|
||||
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
else:
|
||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
||||
# endregion
|
||||
|
||||
# region Metric function
|
||||
@@ -426,11 +391,6 @@ def main():
|
||||
|
||||
# region Convert data to a tf.data.Dataset
|
||||
tf_data = dict()
|
||||
if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
|
||||
logger.info("Padding all batches to max length because argument was set or we're on TPU.")
|
||||
dataset_mode = "constant_batch"
|
||||
else:
|
||||
dataset_mode = "variable_batch"
|
||||
max_samples = {
|
||||
"train": data_args.max_train_samples,
|
||||
"validation": data_args.max_eval_samples,
|
||||
@@ -456,13 +416,14 @@ def main():
|
||||
dataset = datasets[key]
|
||||
if samples_limit is not None:
|
||||
dataset = dataset.select(range(samples_limit))
|
||||
data = convert_dataset_for_tensorflow(
|
||||
dataset,
|
||||
non_label_column_names,
|
||||
batch_size=batch_size,
|
||||
dataset_mode=dataset_mode,
|
||||
drop_remainder=drop_remainder,
|
||||
data = dataset.to_tf_dataset(
|
||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
||||
shuffle=shuffle,
|
||||
batch_size=batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=drop_remainder,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in dataset.column_names else None,
|
||||
)
|
||||
tf_data[key] = data
|
||||
# endregion
|
||||
|
||||
@@ -29,6 +29,8 @@ from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PretrainedConfig,
|
||||
TFAutoModelForSequenceClassification,
|
||||
@@ -58,47 +60,6 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
def convert_dataset_for_tensorflow(
|
||||
dataset, non_label_column_names, batch_size, dataset_mode="variable_batch", shuffle=True, drop_remainder=True
|
||||
):
|
||||
"""Converts a Hugging Face dataset to a Tensorflow Dataset. The dataset_mode controls whether we pad all batches
|
||||
to the maximum sequence length, or whether we only pad to the maximum length within that batch. The former
|
||||
is most useful when training on TPU, as a new graph compilation is required for each sequence length.
|
||||
"""
|
||||
|
||||
def densify_ragged_batch(features, label=None):
|
||||
features = {
|
||||
feature: ragged_tensor.to_tensor(shape=batch_shape[feature]) for feature, ragged_tensor in features.items()
|
||||
}
|
||||
if label is None:
|
||||
return features
|
||||
else:
|
||||
return features, label
|
||||
|
||||
feature_keys = list(set(dataset.features.keys()) - set(non_label_column_names + ["label"]))
|
||||
if dataset_mode == "variable_batch":
|
||||
batch_shape = {key: None for key in feature_keys}
|
||||
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
|
||||
elif dataset_mode == "constant_batch":
|
||||
data = {key: tf.ragged.constant(dataset[key]) for key in feature_keys}
|
||||
batch_shape = {
|
||||
key: tf.concat(([batch_size], ragged_tensor.bounding_shape()[1:]), axis=0)
|
||||
for key, ragged_tensor in data.items()
|
||||
}
|
||||
else:
|
||||
raise ValueError("Unknown dataset mode!")
|
||||
|
||||
if "label" in dataset.features:
|
||||
labels = tf.convert_to_tensor(np.array(dataset["label"]))
|
||||
tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
|
||||
else:
|
||||
tf_dataset = tf.data.Dataset.from_tensor_slices(data)
|
||||
if shuffle:
|
||||
tf_dataset = tf_dataset.shuffle(buffer_size=len(dataset))
|
||||
tf_dataset = tf_dataset.batch(batch_size=batch_size, drop_remainder=drop_remainder).map(densify_ragged_batch)
|
||||
return tf_dataset
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
@@ -399,6 +360,11 @@ def main():
|
||||
return result
|
||||
|
||||
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
|
||||
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
else:
|
||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
||||
# endregion
|
||||
|
||||
with training_args.strategy.scope():
|
||||
@@ -464,18 +430,14 @@ def main():
|
||||
dataset = datasets[key]
|
||||
if samples_limit is not None:
|
||||
dataset = dataset.select(range(samples_limit))
|
||||
if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
|
||||
logger.info("Padding all batches to max length because argument was set or we're on TPU.")
|
||||
dataset_mode = "constant_batch"
|
||||
else:
|
||||
dataset_mode = "variable_batch"
|
||||
data = convert_dataset_for_tensorflow(
|
||||
dataset,
|
||||
non_label_column_names,
|
||||
batch_size=batch_size,
|
||||
dataset_mode=dataset_mode,
|
||||
drop_remainder=drop_remainder,
|
||||
data = dataset.to_tf_dataset(
|
||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
||||
shuffle=shuffle,
|
||||
batch_size=batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=drop_remainder,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in dataset.column_names else None,
|
||||
)
|
||||
tf_data[key] = data
|
||||
# endregion
|
||||
|
||||
@@ -884,7 +884,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
|
||||
|
||||
def train_step(self, data):
|
||||
"""
|
||||
A modification of Keras's default train_step that cleans up the printed metrics when we use a dummy loss.
|
||||
A modification of Keras's default `train_step` that cleans up the printed metrics when we use a dummy loss.
|
||||
"""
|
||||
# These are the only transformations `Model.fit` applies to user-input
|
||||
# data when a `tf.data.Dataset` is provided.
|
||||
|
||||
Reference in New Issue
Block a user