TF Examples Rewrite (#18451)
* Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -658,6 +658,71 @@ jobs:
|
||||
- store_artifacts:
|
||||
path: ~/transformers/reports
|
||||
|
||||
# Per-PR job for the TensorFlow example scripts. The example tests are only
# executed when the test-fetcher step leaves a test_list.txt behind.
run_examples_tensorflow:
    working_directory: ~/transformers
    docker:
        - image: cimg/python:3.7.12
    environment:
        OMP_NUM_THREADS: 1
        TRANSFORMERS_IS_CI: yes
        PYTEST_TIMEOUT: 120
    resource_class: xlarge
    parallelism: 1
    steps:
        - checkout
        - restore_cache:
              keys:
                  - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
                  - v0.5-{{ checksum "setup.py" }}
        - run: pip install --upgrade pip
        - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
        - run: pip install -r examples/tensorflow/_tests_requirements.txt
        - save_cache:
              key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
              paths:
                  - '~/.cache/pip'
        - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
        - store_artifacts:
              path: ~/transformers/test_preparation.txt
        # The log is tee'd to the exact file name that the next store_artifacts
        # step uploads; writing it under any other name means the upload step
        # finds nothing to store.
        - run: |
              if [ -f test_list.txt ]; then
                python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tensorflow_examples_output.txt
              fi
        - store_artifacts:
              path: ~/transformers/tensorflow_examples_output.txt
        - store_artifacts:
              path: ~/transformers/reports
|
||||
|
||||
# Variant of the TensorFlow examples job that always runs the full example
# test suite (no test_list.txt gate).
run_examples_tensorflow_all:
    working_directory: ~/transformers
    docker:
        - image: cimg/python:3.7.12
    environment:
        OMP_NUM_THREADS: 1
        TRANSFORMERS_IS_CI: yes
        PYTEST_TIMEOUT: 120
    resource_class: xlarge
    parallelism: 1
    steps:
        - checkout
        - restore_cache:
              keys:
                  - v0.5-tensorflow_examples-{{ checksum "setup.py" }}
                  - v0.5-{{ checksum "setup.py" }}
        - run: pip install --upgrade pip
        - run: pip install .[sklearn,tensorflow,sentencepiece,testing]
        - run: pip install -r examples/tensorflow/_tests_requirements.txt
        - save_cache:
              key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
              paths:
                  - '~/.cache/pip'
        # The log is tee'd to the exact file name that the next store_artifacts
        # step uploads; writing it under any other name means the upload step
        # finds nothing to store.
        - run: |
              TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tensorflow_examples_output.txt
        - store_artifacts:
              path: ~/transformers/tensorflow_examples_output.txt
        - store_artifacts:
              path: ~/transformers/reports
|
||||
|
||||
run_examples_flax:
|
||||
working_directory: ~/transformers
|
||||
docker:
|
||||
@@ -1000,6 +1065,7 @@ workflows:
|
||||
- check_code_quality
|
||||
- check_repository_consistency
|
||||
- run_examples_torch
|
||||
- run_examples_tensorflow
|
||||
- run_examples_flax
|
||||
- run_tests_custom_tokenizers
|
||||
- run_tests_torch_and_tf
|
||||
@@ -1022,6 +1088,7 @@ workflows:
|
||||
- main
|
||||
jobs:
|
||||
- run_examples_torch_all
|
||||
- run_examples_tensorflow_all
|
||||
- run_examples_flax_all
|
||||
- run_tests_torch_and_tf_all
|
||||
- run_tests_torch_and_flax_all
|
||||
|
||||
25
examples/tensorflow/_tests_requirements.txt
Normal file
25
examples/tensorflow/_tests_requirements.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
tensorflow
|
||||
tensorboard
|
||||
scikit-learn
|
||||
seqeval
|
||||
psutil
|
||||
sacrebleu >= 1.4.12
|
||||
git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
rouge-score
|
||||
tensorflow_datasets
|
||||
matplotlib
|
||||
git-python==1.0.3
|
||||
faiss-cpu
|
||||
streamlit
|
||||
elasticsearch
|
||||
nltk
|
||||
pandas
|
||||
datasets >= 1.13.3
|
||||
fire
|
||||
pytest
|
||||
conllu
|
||||
sentencepiece != 0.1.92
|
||||
protobuf
|
||||
jiwer
|
||||
librosa
|
||||
evaluate >= 0.2.0
|
||||
@@ -22,6 +22,8 @@ https://huggingface.co/models?filter=text-generation
|
||||
"""
|
||||
# You can also adapt this script on your own clm task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
|
||||
# region Imports
|
||||
import logging
|
||||
import math
|
||||
@@ -46,8 +48,8 @@ from transformers import (
|
||||
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForCausalLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -205,21 +207,6 @@ class DataTrainingArguments:
|
||||
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
# region Helper classes
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
||||
# that saves the model with this method after each epoch.
|
||||
def __init__(self, output_dir, **kwargs):
|
||||
super().__init__()
|
||||
self.output_dir = output_dir
|
||||
|
||||
def on_epoch_end(self, epoch, logs=None):
|
||||
self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
@@ -299,6 +286,7 @@ def main():
|
||||
raw_datasets = load_dataset(
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
@@ -306,12 +294,14 @@ def main():
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
else:
|
||||
@@ -321,16 +311,39 @@ def main():
|
||||
data_files["train"] = data_args.train_file
|
||||
if data_args.validation_file is not None:
|
||||
data_files["validation"] = data_args.validation_file
|
||||
extension = data_args.train_file.split(".")[-1]
|
||||
extension = (
|
||||
data_args.train_file.split(".")[-1]
|
||||
if data_args.train_file is not None
|
||||
else data_args.validation_file.split(".")[-1]
|
||||
)
|
||||
if extension == "txt":
|
||||
extension = "text"
|
||||
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
|
||||
raw_datasets = load_dataset(
|
||||
extension,
|
||||
data_files=data_files,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
**dataset_args,
|
||||
)
|
||||
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
extension,
|
||||
data_files=data_files,
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
**dataset_args,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
extension,
|
||||
data_files=data_files,
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
**dataset_args,
|
||||
)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
# endregion
|
||||
@@ -446,7 +459,7 @@ def main():
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
# endregion
|
||||
|
||||
@@ -465,44 +478,88 @@ def main():
|
||||
|
||||
# region TF Dataset preparation
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
options = tf.data.Options()
|
||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
|
||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
shuffle=False,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer and loss
|
||||
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
|
||||
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
# Bias and layernorm weights are automatically excluded from the decay
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
|
||||
num_warmup_steps=training_args.warmup_steps,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer)
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-clm"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and validation
|
||||
@@ -512,33 +569,45 @@ def main():
|
||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
||||
|
||||
# For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
|
||||
# to the Hugging Face Hub rather than just pushing the finished model.
|
||||
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
|
||||
|
||||
history = model.fit(
|
||||
tf_train_dataset,
|
||||
validation_data=tf_eval_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
|
||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
||||
callbacks=callbacks,
|
||||
)
|
||||
train_loss = history.history["loss"][-1]
|
||||
try:
|
||||
train_perplexity = math.exp(history.history["loss"][-1])
|
||||
train_perplexity = math.exp(train_loss)
|
||||
except OverflowError:
|
||||
train_perplexity = math.inf
|
||||
logger.info(f" Final train loss: {train_loss:.3f}")
|
||||
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
validation_loss = history.history["val_loss"][-1]
|
||||
try:
|
||||
validation_perplexity = math.exp(history.history["val_loss"][-1])
|
||||
validation_perplexity = math.exp(validation_loss)
|
||||
except OverflowError:
|
||||
validation_perplexity = math.inf
|
||||
logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}")
|
||||
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
|
||||
logger.info(f" Final validation loss: {validation_loss:.3f}")
|
||||
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
results_dict = dict()
|
||||
results_dict["train_loss"] = train_loss
|
||||
results_dict["train_perplexity"] = train_perplexity
|
||||
results_dict["eval_loss"] = validation_loss
|
||||
results_dict["eval_perplexity"] = validation_perplexity
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(results_dict))
|
||||
# endregion
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# You'll probably want to include some of your own metadata here!
|
||||
model.push_to_hub()
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -22,9 +22,7 @@ https://huggingface.co/models?filter=fill-mask
|
||||
"""
|
||||
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
||||
|
||||
# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected
|
||||
# TODO Duplicate all changes over to the CLM script
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
@@ -50,6 +48,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForMaskedLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -217,22 +216,6 @@ class DataTrainingArguments:
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper classes
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
||||
# that saves the model with this method after each epoch.
|
||||
def __init__(self, output_dir, **kwargs):
|
||||
super().__init__()
|
||||
self.output_dir = output_dir
|
||||
|
||||
def on_epoch_end(self, epoch, logs=None):
|
||||
self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
def main():
|
||||
# region Argument Parsing
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
||||
@@ -492,7 +475,7 @@ def main():
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
# endregion
|
||||
|
||||
@@ -517,40 +500,88 @@ def main():
|
||||
options = tf.data.Options()
|
||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
|
||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
||||
shuffle=False,
|
||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
).with_options(options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer and loss
|
||||
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
|
||||
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
# Bias and layernorm weights are automatically excluded from the decay
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
|
||||
num_warmup_steps=training_args.warmup_steps,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer)
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-mlm"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and validation
|
||||
@@ -560,33 +591,46 @@ def main():
|
||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
||||
|
||||
# For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
|
||||
# to the Hugging Face Hub rather than just pushing the finished model.
|
||||
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
|
||||
|
||||
history = model.fit(
|
||||
tf_train_dataset,
|
||||
validation_data=tf_eval_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
|
||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
||||
callbacks=callbacks,
|
||||
)
|
||||
train_loss = history.history["loss"][-1]
|
||||
try:
|
||||
train_perplexity = math.exp(history.history["loss"][-1])
|
||||
train_perplexity = math.exp(train_loss)
|
||||
except OverflowError:
|
||||
train_perplexity = math.inf
|
||||
try:
|
||||
validation_perplexity = math.exp(history.history["val_loss"][-1])
|
||||
except OverflowError:
|
||||
validation_perplexity = math.inf
|
||||
logger.warning(f" Final train loss: {history.history['loss'][-1]:.3f}")
|
||||
logger.warning(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
logger.warning(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
|
||||
logger.warning(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||
logger.info(f" Final train loss: {train_loss:.3f}")
|
||||
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||
|
||||
validation_loss = history.history["val_loss"][-1]
|
||||
try:
|
||||
validation_perplexity = math.exp(validation_loss)
|
||||
except OverflowError:
|
||||
validation_perplexity = math.inf
|
||||
logger.info(f" Final validation loss: {validation_loss:.3f}")
|
||||
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
results_dict = dict()
|
||||
results_dict["train_loss"] = train_loss
|
||||
results_dict["train_perplexity"] = train_perplexity
|
||||
results_dict["eval_loss"] = validation_loss
|
||||
results_dict["eval_perplexity"] = validation_perplexity
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(results_dict))
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# You'll probably want to append some of your own metadata here!
|
||||
model.push_to_hub()
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -18,6 +18,7 @@ Fine-tuning the library models for multiple choice.
|
||||
"""
|
||||
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -38,6 +39,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForMultipleChoice,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -54,16 +56,6 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# region Helper classes and functions
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
||||
# that saves the model with this method after each epoch.
|
||||
def __init__(self, output_dir, **kwargs):
|
||||
super().__init__()
|
||||
self.output_dir = output_dir
|
||||
|
||||
def on_epoch_end(self, epoch, logs=None):
|
||||
self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -391,7 +383,6 @@ def main():
|
||||
if "train" not in raw_datasets:
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
|
||||
if data_args.max_train_samples is not None:
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
@@ -407,8 +398,6 @@ def main():
|
||||
if "validation" not in raw_datasets:
|
||||
raise ValueError("--do_eval requires a validation dataset")
|
||||
eval_dataset = raw_datasets["validation"]
|
||||
if not training_args.do_train:
|
||||
non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
|
||||
if data_args.max_eval_samples is not None:
|
||||
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
@@ -444,79 +433,120 @@ def main():
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
|
||||
if training_args.do_train:
|
||||
total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
|
||||
num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
else:
|
||||
optimizer = "adam" # Just put anything in here, since we're not using it anyway
|
||||
model.compile(
|
||||
optimizer=optimizer,
|
||||
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
|
||||
metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
|
||||
)
|
||||
optimizer = None
|
||||
model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"}
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
eval_metrics = None
|
||||
if training_args.do_train:
|
||||
dataset_exclude_cols = set(non_label_columns + ["label"])
|
||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
||||
columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols],
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=total_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in train_dataset.column_names else None,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
|
||||
if training_args.do_eval:
|
||||
validation_data = eval_dataset.to_tf_dataset(
|
||||
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
|
||||
validation_data = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
shuffle=False,
|
||||
batch_size=total_eval_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in eval_dataset.column_names else None,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
else:
|
||||
validation_data = None
|
||||
model.fit(
|
||||
history = model.fit(
|
||||
tf_train_dataset,
|
||||
validation_data=validation_data,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
||||
callbacks=callbacks,
|
||||
)
|
||||
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||
# endregion
|
||||
|
||||
# region Evaluation
|
||||
if training_args.do_eval and not training_args.do_train:
|
||||
dataset_exclude_cols = set(non_label_columns + ["label"])
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
# Do a standalone evaluation pass
|
||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
||||
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
shuffle=False,
|
||||
batch_size=total_eval_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in eval_dataset.column_names else None,
|
||||
)
|
||||
model.evaluate(tf_eval_dataset)
|
||||
).with_options(dataset_options)
|
||||
eval_results = model.evaluate(tf_eval_dataset)
|
||||
eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]}
|
||||
# endregion
|
||||
|
||||
if eval_metrics is not None and training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metrics))
|
||||
|
||||
# region Push to hub
|
||||
if training_args.push_to_hub:
|
||||
model.push_to_hub(
|
||||
finetuned_from=model_args.model_name_or_path,
|
||||
tasks="multiple-choice",
|
||||
dataset_tags="swag",
|
||||
dataset_args="regular",
|
||||
dataset="SWAG",
|
||||
language="en",
|
||||
)
|
||||
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
# endregion
|
||||
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
|
||||
"""
|
||||
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -33,13 +34,13 @@ import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
DefaultDataCollator,
|
||||
EvalPrediction,
|
||||
HfArgumentParser,
|
||||
PreTrainedTokenizerFast,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForQuestionAnswering,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
|
||||
@@ -609,7 +610,12 @@ def main():
|
||||
# endregion
|
||||
|
||||
with training_args.strategy.scope():
|
||||
# region Load model
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
|
||||
# region Load model and prepare datasets
|
||||
if checkpoint is None:
|
||||
model_path = model_args.model_name_or_path
|
||||
else:
|
||||
@@ -621,71 +627,163 @@ def main():
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
optimizer = tf.keras.optimizers.Adam(
|
||||
learning_rate=training_args.learning_rate,
|
||||
beta_1=training_args.adam_beta1,
|
||||
beta_2=training_args.adam_beta2,
|
||||
epsilon=training_args.adam_epsilon,
|
||||
clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
if training_args.do_train:
|
||||
|
||||
training_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["train"],
|
||||
shuffle=True,
|
||||
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
training_dataset = training_dataset.with_options(dataset_options)
|
||||
|
||||
num_train_steps = len(training_dataset) * training_args.num_train_epochs
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
optimizer, schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=len(training_dataset) * training_args.num_train_epochs,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||
|
||||
else:
|
||||
model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||
training_dataset = None
|
||||
|
||||
if training_args.do_eval:
|
||||
eval_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["validation"],
|
||||
shuffle=False,
|
||||
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
eval_dataset = eval_dataset.with_options(dataset_options)
|
||||
else:
|
||||
eval_dataset = None
|
||||
|
||||
if training_args.do_predict:
|
||||
predict_dataset = model.prepare_tf_dataset(
|
||||
processed_datasets["test"],
|
||||
shuffle=False,
|
||||
batch_size=training_args.per_device_eval_batch_size * num_replicas,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
predict_dataset = predict_dataset.with_options(dataset_options)
|
||||
else:
|
||||
predict_dataset = None
|
||||
|
||||
# no user-specified loss = will use the model internal loss
|
||||
model.compile(optimizer=optimizer)
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
if padding:
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
||||
tensor_keys = ["attention_mask", "input_ids"]
|
||||
label_keys = ["start_positions", "end_positions"]
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and Evaluation
|
||||
|
||||
if training_args.do_train:
|
||||
# Make a tf.data.Dataset for this
|
||||
training_dataset = processed_datasets["train"].to_tf_dataset(
|
||||
# labels are passed as input, as we will use the model's internal loss
|
||||
columns=tensor_keys + label_keys,
|
||||
shuffle=True,
|
||||
batch_size=training_args.per_device_train_batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=True,
|
||||
)
|
||||
model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
|
||||
# endregion
|
||||
# Note that the validation and test datasets have been processed in a different way to the
|
||||
# training datasets in this example, and so they don't have the same label structure.
|
||||
# As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
|
||||
# after training.
|
||||
model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||
|
||||
# region Evaluation
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluation ***")
|
||||
eval_inputs = {
|
||||
"input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
|
||||
"attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
|
||||
}
|
||||
eval_predictions = model.predict(eval_inputs)
|
||||
|
||||
# In this example, we compute advanced metrics at the end of training, but
|
||||
# if you'd like to compute metrics every epoch that are too complex to be written as
|
||||
# standard Keras metrics, you can use our KerasMetricCallback. See
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
|
||||
|
||||
eval_predictions = model.predict(eval_dataset)
|
||||
if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
|
||||
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||
# padding positions are correctly masked.
|
||||
eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||
eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||
else:
|
||||
eval_start_logits = eval_predictions.start_logits
|
||||
eval_end_logits = eval_predictions.end_logits
|
||||
|
||||
post_processed_eval = post_processing_function(
|
||||
datasets["validation"],
|
||||
processed_datasets["validation"],
|
||||
(eval_predictions.start_logits, eval_predictions.end_logits),
|
||||
(eval_start_logits, eval_end_logits),
|
||||
)
|
||||
metrics = compute_metrics(post_processed_eval)
|
||||
logging.info("Evaluation metrics:")
|
||||
for metric, value in metrics.items():
|
||||
logging.info(f"{metric}: {value:.3f}")
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(metrics))
|
||||
# endregion
|
||||
|
||||
# region Prediction
|
||||
if training_args.do_predict:
|
||||
logger.info("*** Predict ***")
|
||||
predict_inputs = {
|
||||
"input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
|
||||
"attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
|
||||
}
|
||||
test_predictions = model.predict(predict_inputs)
|
||||
|
||||
test_predictions = model.predict(predict_dataset)
|
||||
if isinstance(test_predictions.start_logits, tf.RaggedTensor):
|
||||
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||
# padding positions are correctly masked.
|
||||
test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||
test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||
else:
|
||||
test_start_logits = test_predictions.start_logits
|
||||
test_end_logits = test_predictions.end_logits
|
||||
post_processed_test = post_processing_function(
|
||||
datasets["test"],
|
||||
processed_datasets["test"],
|
||||
(test_predictions.start_logits, test_predictions.end_logits),
|
||||
(test_start_logits, test_end_logits),
|
||||
)
|
||||
metrics = compute_metrics(post_processed_test)
|
||||
|
||||
@@ -694,8 +792,9 @@ def main():
|
||||
logging.info(f"{metric}: {value:.3f}")
|
||||
# endregion
|
||||
|
||||
if training_args.push_to_hub:
|
||||
model.push_to_hub()
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -18,11 +18,11 @@ Fine-tuning the library models for summarization.
|
||||
"""
|
||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
@@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
import evaluate
|
||||
import transformers
|
||||
@@ -38,7 +37,10 @@ from filelock import FileLock
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
HfArgumentParser,
|
||||
KerasMetricCallback,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForSeq2SeqLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -253,7 +255,6 @@ class DataTrainingArguments:
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Dataset name mappings
|
||||
summarization_name_mapping = {
|
||||
"amazon_reviews_multi": ("review_body", "review_title"),
|
||||
@@ -272,71 +273,6 @@ summarization_name_mapping = {
|
||||
# endregion
|
||||
|
||||
|
||||
# region Data generator
|
||||
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield `(features, labels)` pairs, one dataset example at a time.

    Args:
        dataset: Indexable collection of examples; `len(dataset)` and `dataset[i]` are used.
        model: Optional model. When it provides `prepare_decoder_input_ids_from_labels`, that
            method is used to add a `decoder_input_ids` entry to each example.
        tokenizer: Object whose `pad()` method pads one example and returns NumPy arrays.
        shuffle: Visit the examples in a random permutation when true, in index order otherwise.
        pad_to_multiple_of: Forwarded unchanged to `tokenizer.pad()`.

    Yields:
        A tuple of the feature dict (values converted to tensors) and its `"labels"` entry.
    """
    visit_order = np.random.permutation(len(dataset)) if shuffle else np.arange(len(dataset))
    for position in visit_order:
        raw_example = dataset[int(position)]
        # Handle dicts with proper padding and conversion to tensor.
        padded = tokenizer.pad(raw_example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
        features = {}
        for name, array in padded.items():
            features[name] = tf.convert_to_tensor(array, dtype_hint=tf.int32)
        if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
            # The helper is called on a batch of one, so add and then drop a leading axis.
            batched_labels = tf.expand_dims(features["labels"], 0)
            decoder_inputs = model.prepare_decoder_input_ids_from_labels(labels=batched_labels)
            features["decoder_input_ids"] = tf.squeeze(decoder_inputs, 0)
        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper functions
|
||||
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap a tokenized dataset in a padded, batched and repeated `tf.data.Dataset`.

    Args:
        dataset: Dataset exposing `.features`; `None` is passed straight through.
        model: Optional model. When it provides `prepare_decoder_input_ids_from_labels`, a
            `decoder_input_ids` entry is added to the output signature.
        tokenizer: Tokenizer supplying `pad_token_id` (0 is used when it is `None`).
        total_batch_size: Batch size given to `padded_batch`; incomplete batches are dropped.
        num_epochs: Number of repetitions of the dataset (converted with `int()`).
        shuffle: Forwarded to `sample_generator` to pick a random or sequential example order.

    Returns:
        The batched `tf.data.Dataset` of `(features, labels)` pairs, or `None` when
        `dataset` is `None`.
    """
    if dataset is None:
        return None
    train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
    train_signature = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
        for feature in dataset.features
        if feature != "special_tokens_mask"
    }
    if (
        model is not None
        and "decoder_input_ids" not in train_signature
        and hasattr(model, "prepare_decoder_input_ids_from_labels")
    ):
        train_signature["decoder_input_ids"] = train_signature["labels"]

    # This may need to be changed depending on your particular model or tokenizer!
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    padding_values = {
        # Attention masks mark padded positions with 0, so they must not be filled with the
        # pad token id: a non-zero pad id would make the model attend to the padding.
        key: tf.convert_to_tensor(0 if key.endswith("attention_mask") else pad_token_id, dtype=tf.int32)
        for key in train_signature
    }
    # -100 is the label value used to mark padded label positions.
    padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)
    train_signature["labels"] = train_signature["input_ids"]
    # The generator yields (features, labels) pairs, so the signature is a pair as well.
    train_signature = (train_signature, train_signature["labels"])
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = (
        tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
        .with_options(options)
        .padded_batch(
            batch_size=total_batch_size,
            drop_remainder=True,
            padding_values=(padding_values, np.array(-100, dtype=np.int32)),
        )
        .repeat(int(num_epochs))
    )
    return tf_dataset
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
def main():
|
||||
# region Argument parsing
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
@@ -587,59 +523,148 @@ def main():
|
||||
if model.config.decoder_start_token_id is None:
|
||||
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
||||
|
||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer,
|
||||
model=model,
|
||||
label_pad_token_id=label_pad_token_id,
|
||||
pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation
|
||||
return_tensors="tf",
|
||||
)
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
tf_train_dataset = dataset_to_tf(
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_batch_size=total_train_batch_size,
|
||||
num_epochs=training_args.num_train_epochs,
|
||||
collate_fn=data_collator,
|
||||
batch_size=total_train_batch_size,
|
||||
shuffle=True,
|
||||
)
|
||||
tf_eval_dataset = dataset_to_tf(
|
||||
).with_options(dataset_options)
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_eval_batch_size,
|
||||
num_epochs=1,
|
||||
collate_fn=data_collator,
|
||||
batch_size=total_eval_batch_size,
|
||||
shuffle=False,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and LR scheduling
|
||||
# Scheduler and math around the number of training steps.
|
||||
num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
|
||||
)
|
||||
|
||||
def masked_sparse_categorical_crossentropy(y_true, y_pred):
|
||||
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
|
||||
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
|
||||
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
|
||||
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
|
||||
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
|
||||
# More pragmatically, consider redesigning your tokenizer.
|
||||
losses = tf.keras.losses.sparse_categorical_crossentropy(
|
||||
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
|
||||
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
if training_args.do_train:
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
# Compute the per-sample loss only over the unmasked tokens
|
||||
losses = tf.ragged.boolean_mask(losses, y_true != -100)
|
||||
losses = tf.reduce_mean(losses, axis=-1)
|
||||
return losses
|
||||
else:
|
||||
optimizer = None
|
||||
|
||||
# endregion
|
||||
|
||||
# region Metric
|
||||
metric = evaluate.load("rouge")
|
||||
# region Metric and KerasMetricCallback
|
||||
if training_args.do_eval:
|
||||
metric = evaluate.load("rouge")
|
||||
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
|
||||
}
|
||||
|
||||
def compute_metrics(preds):
    """Decode predictions and labels to text and score them with the loaded metric.

    Args:
        preds: Pair `(predictions, labels)`. When `predictions` is a tuple, only its first
            element is decoded.

    Returns:
        Dict mapping each metric key to its mid F-measure, scaled to 0-100 and rounded to
        four decimals.
    """
    generated, references = preds
    if isinstance(generated, tuple):
        generated = generated[0]
    generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
    # -100 marks positions without a real label; swap in the pad token id so they decode cleanly.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    reference_text = tokenizer.batch_decode(references, skip_special_tokens=True)
    generated_text, reference_text = postprocess_text(generated_text, reference_text)
    raw_scores = metric.compute(predictions=generated_text, references=reference_text, use_stemmer=True)
    # Only print the mid f-measures, but there are a lot of other statistics in there too!
    summary = {}
    for name, score in raw_scores.items():
        summary[name] = round(score.mid.fmeasure * 100, 4)
    return summary
|
||||
|
||||
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
|
||||
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
|
||||
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
|
||||
# For more information, see the docs at
|
||||
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
|
||||
|
||||
metric_callback = KerasMetricCallback(
|
||||
metric_fn=compute_metrics,
|
||||
eval_dataset=tf_eval_dataset,
|
||||
predict_with_generate=True,
|
||||
use_xla_generation=True,
|
||||
generate_kwargs=gen_kwargs,
|
||||
)
|
||||
callbacks = [metric_callback]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-summarization"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# Because this training can be quite long, we save once per epoch.
|
||||
callbacks.append(
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
|
||||
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||
eval_metrics = None
|
||||
if training_args.do_train:
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
@@ -648,28 +673,29 @@ def main():
|
||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {num_train_steps}")
|
||||
|
||||
model.fit(
|
||||
tf_train_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=num_update_steps_per_epoch,
|
||||
)
|
||||
if training_args.xla and not data_args.pad_to_max_length:
|
||||
logger.warning(
|
||||
"XLA training may be slow at first when --pad_to_max_length is not set "
|
||||
"until all possible shapes have been compiled."
|
||||
)
|
||||
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||
# endregion
|
||||
|
||||
# region Validation
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
}
|
||||
if training_args.do_eval:
|
||||
if training_args.do_eval and not training_args.do_train:
|
||||
# Do a standalone evaluation run
|
||||
logger.info("Evaluation...")
|
||||
for batch, labels in tqdm(
|
||||
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
|
||||
):
|
||||
|
||||
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
|
||||
@tf.function(jit_compile=True)
|
||||
def generate(**kwargs):
|
||||
return model.generate(**kwargs)
|
||||
|
||||
for batch, labels in tf_eval_dataset:
|
||||
batch.update(gen_kwargs)
|
||||
generated_tokens = model.generate(**batch)
|
||||
generated_tokens = generate(**batch)
|
||||
if isinstance(generated_tokens, tuple):
|
||||
generated_tokens = generated_tokens[0]
|
||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||
@@ -679,13 +705,19 @@ def main():
|
||||
|
||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||
|
||||
result = metric.compute(use_stemmer=True)
|
||||
result = {k: round(v * 100, 4) for k, v in result.items()}
|
||||
eval_metrics = metric.compute(use_stemmer=True)
|
||||
|
||||
result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()}
|
||||
logger.info(result)
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
if training_args.output_dir is not None and eval_metrics is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metrics))
|
||||
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
|
||||
295
examples/tensorflow/test_tensorflow_examples.py
Normal file
295
examples/tensorflow/test_tensorflow_examples.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2022 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from unittest import skip
|
||||
from unittest.mock import patch
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
|
||||
|
||||
|
||||
# Directories (siblings of this test file) that hold the example scripts. They are appended to
# sys.path so that each script can be imported below as a plain top-level module.
SRC_DIRS = [
    os.path.join(os.path.dirname(__file__), dirname)
    for dirname in (
        "text-generation",
        "text-classification",
        "token-classification",
        "language-modeling",
        "multiple-choice",
        "question-answering",
        "summarization",
        "translation",
    )
]
sys.path.extend(SRC_DIRS)


# SRC_DIRS is a list, so this condition is always true.
# NOTE(review): the guard presumably exists only to keep import sorters from hoisting these
# imports above the sys.path manipulation they depend on -- confirm before removing it.
if SRC_DIRS is not None:
    import run_clm
    import run_mlm
    import run_ner
    import run_qa as run_squad
    import run_summarization
    import run_swag
    import run_text_classification
    import run_translation


logging.basicConfig(level=logging.DEBUG)

# Root logger: configured above at DEBUG level.
logger = logging.getLogger()
|
||||
|
||||
|
||||
def get_setup_file():
    """Return the value of the ``-f`` command-line option, or ``None`` when it is not given.

    The option is parsed from ``sys.argv``; any argument other than ``-f`` makes argparse
    report an error and exit.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("-f")
    return cli.parse_args().f
|
||||
|
||||
|
||||
def get_results(output_dir):
    """Load the metrics an example script wrote to ``all_results.json``.

    Args:
        output_dir: Directory that is expected to contain ``all_results.json``.

    Returns:
        The decoded JSON content of ``<output_dir>/all_results.json``.

    Raises:
        ValueError: If the results file cannot be found.
    """
    path = os.path.join(output_dir, "all_results.json")
    try:
        # Open directly instead of checking for existence first: one filesystem access and no
        # window in which the file can disappear between the check and the read.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError) as err:
        raise ValueError(f"can't find {path}") from err
|
||||
|
||||
|
||||
def is_cuda_available():
    """Return ``True`` when TensorFlow lists at least one physical device of type ``"GPU"``."""
    gpu_devices = tf.config.list_physical_devices("GPU")
    return len(gpu_devices) > 0
|
||||
|
||||
|
||||
# Attach an extra handler that writes log records to stdout.
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
|
||||
|
||||
|
||||
class ExamplesTests(TestCasePlus):
    """End-to-end smoke tests for the TensorFlow example scripts.

    Every test builds a small command line, patches it into ``sys.argv``, calls the script's
    ``main()`` and then checks the metrics read back from ``all_results.json`` in a temporary
    output directory.
    """

    @skip("Skipping until shape inference for to_tf_dataset PR is merged.")
    def test_run_text_classification(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_text_classification.py
            --model_name_or_path distilbert-base-uncased
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --train_file ./tests/fixtures/tests_samples/MRPC/train.csv
            --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
            --do_train
            --do_eval
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=1
            --learning_rate=1e-4
            --max_steps=10
            --warmup_steps=2
            --seed=42
            --max_seq_length=128
            """.split()

        if is_cuda_available():
            testargs.append("--fp16")

        with patch.object(sys, "argv", testargs):
            try:
                run_text_classification.main()
            finally:
                # Reset the mixed precision policy so we don't break other tests. This lives in
                # `finally` so the reset also happens when the script itself raises.
                tf.keras.mixed_precision.set_global_policy("float32")
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["eval_accuracy"], 0.75)

    def test_run_clm(self):
        if len(tf.config.list_physical_devices("GPU")) > 1:
            # Report the test as skipped (rather than silently passing) on multi-GPU hosts.
            self.skipTest(
                "Skipping because there are not enough batches to train the model + would need a drop_last to work."
            )

        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_clm.py
            --model_name_or_path distilgpt2
            --train_file ./tests/fixtures/sample_text.txt
            --validation_file ./tests/fixtures/sample_text.txt
            --do_train
            --do_eval
            --block_size 128
            --per_device_train_batch_size 2
            --per_device_eval_batch_size 1
            --num_train_epochs 2
            --output_dir {tmp_dir}
            --overwrite_output_dir
            """.split()

        with patch.object(sys, "argv", testargs):
            run_clm.main()
            result = get_results(tmp_dir)
            self.assertLess(result["eval_perplexity"], 100)

    def test_run_mlm(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_mlm.py
            --model_name_or_path distilroberta-base
            --train_file ./tests/fixtures/sample_text.txt
            --validation_file ./tests/fixtures/sample_text.txt
            --max_seq_length 64
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --do_train
            --do_eval
            --prediction_loss_only
            --num_train_epochs=1
            """.split()

        with patch.object(sys, "argv", testargs):
            run_mlm.main()
            result = get_results(tmp_dir)
            self.assertLess(result["eval_perplexity"], 42)

    def test_run_ner(self):
        # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
        epochs = 7 if get_gpu_count() > 1 else 2

        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_ner.py
            --model_name_or_path bert-base-uncased
            --train_file tests/fixtures/tests_samples/conll/sample.json
            --validation_file tests/fixtures/tests_samples/conll/sample.json
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --do_train
            --do_eval
            --warmup_steps=2
            --learning_rate=2e-4
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=2
            --num_train_epochs={epochs}
            --seed 7
            """.split()

        with patch.object(sys, "argv", testargs):
            run_ner.main()
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["accuracy"], 0.75)

    def test_run_squad(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_qa.py
            --model_name_or_path bert-base-uncased
            --version_2_with_negative
            --train_file tests/fixtures/tests_samples/SQUAD/sample.json
            --validation_file tests/fixtures/tests_samples/SQUAD/sample.json
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --max_steps=10
            --warmup_steps=2
            --do_train
            --do_eval
            --learning_rate=2e-4
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=1
            """.split()

        with patch.object(sys, "argv", testargs):
            run_squad.main()
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["f1"], 30)
            self.assertGreaterEqual(result["exact"], 30)

    def test_run_swag(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_swag.py
            --model_name_or_path bert-base-uncased
            --train_file tests/fixtures/tests_samples/swag/sample.json
            --validation_file tests/fixtures/tests_samples/swag/sample.json
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --max_steps=20
            --warmup_steps=2
            --do_train
            --do_eval
            --learning_rate=2e-4
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=1
            """.split()

        with patch.object(sys, "argv", testargs):
            run_swag.main()
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["val_accuracy"], 0.8)

    @slow
    def test_run_summarization(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        testargs = f"""
            run_summarization.py
            --model_name_or_path t5-small
            --train_file tests/fixtures/tests_samples/xsum/sample.json
            --validation_file tests/fixtures/tests_samples/xsum/sample.json
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --max_steps=50
            --warmup_steps=8
            --do_train
            --do_eval
            --learning_rate=2e-4
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=1
            """.split()

        with patch.object(sys, "argv", testargs):
            run_summarization.main()
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["rouge1"], 10)
            self.assertGreaterEqual(result["rouge2"], 2)
            self.assertGreaterEqual(result["rougeL"], 7)
            self.assertGreaterEqual(result["rougeLsum"], 7)

    @slow
    def test_run_translation(self):
        tmp_dir = self.get_auto_remove_tmp_dir()
        # The language options are passed exactly once. A repeated option would be resolved by
        # argparse to its last occurrence, so only the effective values are listed here.
        testargs = f"""
            run_translation.py
            --model_name_or_path Rocketknight1/student_marian_en_ro_6_1
            --train_file tests/fixtures/tests_samples/wmt16/sample.json
            --validation_file tests/fixtures/tests_samples/wmt16/sample.json
            --output_dir {tmp_dir}
            --overwrite_output_dir
            --warmup_steps=8
            --do_train
            --do_eval
            --learning_rate=3e-3
            --num_train_epochs 12
            --per_device_train_batch_size=2
            --per_device_eval_batch_size=1
            --source_lang en_XX
            --target_lang ro_RO
            """.split()

        with patch.object(sys, "argv", testargs):
            run_translation.main()
            result = get_results(tmp_dir)
            self.assertGreaterEqual(result["bleu"], 30)
|
||||
@@ -16,6 +16,7 @@
|
||||
""" Finetuning the library models for sequence classification on GLUE."""
|
||||
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -35,32 +36,16 @@ from transformers import (
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PretrainedConfig,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForSequenceClassification,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
||||
from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# region Helper functions
|
||||
|
||||
|
||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    """Keras callback that calls ``save_pretrained()`` on the model after each epoch.

    Hugging Face models have a ``save_pretrained()`` method that saves both the weights and the
    necessary metadata to allow them to be loaded as a pretrained model in future.

    Args:
        output_dir: Directory passed to ``save_pretrained()`` at the end of every epoch.
        **kwargs: Accepted but not used.
    """

    def __init__(self, output_dir, **kwargs):
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        # `self.model` is the model this callback is attached to; each call writes to the same
        # directory.
        self.model.save_pretrained(self.output_dir)
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.22.0.dev0")
|
||||
|
||||
@@ -312,7 +297,6 @@ def main():
|
||||
|
||||
# region Dataset preprocessing
|
||||
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
|
||||
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
|
||||
|
||||
# Padding strategy
|
||||
if data_args.pad_to_max_length:
|
||||
@@ -394,24 +378,11 @@ def main():
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and compilation
|
||||
optimizer = tf.keras.optimizers.Adam(
|
||||
learning_rate=training_args.learning_rate,
|
||||
beta_1=training_args.adam_beta1,
|
||||
beta_2=training_args.adam_beta2,
|
||||
epsilon=training_args.adam_epsilon,
|
||||
clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
if is_regression:
|
||||
loss_fn = tf.keras.losses.MeanSquaredError()
|
||||
metrics = []
|
||||
else:
|
||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||
metrics = ["accuracy"]
|
||||
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
|
||||
# endregion
|
||||
|
||||
# region Convert data to a tf.data.Dataset
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
|
||||
tf_data = dict()
|
||||
max_samples = {
|
||||
"train": data_args.max_train_samples,
|
||||
@@ -428,31 +399,89 @@ def main():
|
||||
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
||||
if key == "train":
|
||||
shuffle = True
|
||||
batch_size = training_args.per_device_train_batch_size
|
||||
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
|
||||
batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
else:
|
||||
shuffle = False
|
||||
batch_size = training_args.per_device_eval_batch_size
|
||||
drop_remainder = False
|
||||
batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
samples_limit = max_samples[key]
|
||||
dataset = datasets[key]
|
||||
if samples_limit is not None:
|
||||
dataset = dataset.select(range(samples_limit))
|
||||
data = dataset.to_tf_dataset(
|
||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
data = model.prepare_tf_dataset(
|
||||
dataset,
|
||||
shuffle=shuffle,
|
||||
batch_size=batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=drop_remainder,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in dataset.column_names else None,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
data = data.with_options(dataset_options)
|
||||
tf_data[key] = data
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and compilation
|
||||
if training_args.do_train:
|
||||
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
optimizer, schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
else:
|
||||
optimizer = "adam" # Just write anything because we won't be using it
|
||||
if is_regression:
|
||||
metrics = []
|
||||
else:
|
||||
metrics = ["accuracy"]
|
||||
model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-glue"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
|
||||
model_card_kwargs["task_name"] = data_args.task_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and validation
|
||||
if training_args.do_train:
|
||||
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
|
||||
if training_args.do_eval and not data_args.task_name == "mnli":
|
||||
# Do both evaluation and training in the Keras fit loop, unless the task is MNLI
|
||||
# because MNLI has two validation sets
|
||||
@@ -472,6 +501,12 @@ def main():
|
||||
# We normally do validation as part of the Keras fit loop, but we run it independently
|
||||
# if there was no fit() step (because we didn't train the model) or if the task is MNLI,
|
||||
# because MNLI has a separate validation-mismatched validation set
|
||||
|
||||
# In this example, we compute advanced metrics only at the end of training, and only compute
|
||||
# loss and accuracy on the validation set each epoch, but
|
||||
# if you'd like to compute metrics every epoch that are too complex to be written as
|
||||
# standard Keras metrics, you can use our KerasMetricCallback. See
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
||||
@@ -489,6 +524,10 @@ def main():
|
||||
eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"])
|
||||
print(f"Evaluation metrics ({task}):")
|
||||
print(eval_metrics)
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metrics))
|
||||
|
||||
# endregion
|
||||
|
||||
@@ -538,6 +577,10 @@ def main():
|
||||
writer.write(f"{index}\t{item}\n")
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
""" Fine-tuning the library models for sequence classification."""
|
||||
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -29,12 +30,12 @@ from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
DefaultDataCollator,
|
||||
HfArgumentParser,
|
||||
PretrainedConfig,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForSequenceClassification,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry
|
||||
@@ -383,10 +384,6 @@ def main():
|
||||
|
||||
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
|
||||
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
else:
|
||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
||||
# endregion
|
||||
|
||||
with training_args.strategy.scope():
|
||||
@@ -409,24 +406,10 @@ def main():
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and compilation
|
||||
optimizer = tf.keras.optimizers.Adam(
|
||||
learning_rate=training_args.learning_rate,
|
||||
beta_1=training_args.adam_beta1,
|
||||
beta_2=training_args.adam_beta2,
|
||||
epsilon=training_args.adam_epsilon,
|
||||
clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
if is_regression:
|
||||
loss_fn = tf.keras.losses.MeanSquaredError()
|
||||
metrics = []
|
||||
else:
|
||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
||||
metrics = ["accuracy"]
|
||||
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
|
||||
# endregion
|
||||
|
||||
# region Convert data to a tf.data.Dataset
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
|
||||
tf_data = dict()
|
||||
max_samples = {
|
||||
@@ -438,50 +421,121 @@ def main():
|
||||
if key not in datasets:
|
||||
tf_data[key] = None
|
||||
continue
|
||||
if (
|
||||
(key == "train" and not training_args.do_train)
|
||||
or (key == "validation" and not training_args.do_eval)
|
||||
or (key == "test" and not training_args.do_predict)
|
||||
):
|
||||
tf_data[key] = None
|
||||
continue
|
||||
if key in ("train", "validation"):
|
||||
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
||||
if key == "train":
|
||||
shuffle = True
|
||||
batch_size = training_args.per_device_train_batch_size
|
||||
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
|
||||
batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
else:
|
||||
shuffle = False
|
||||
batch_size = training_args.per_device_eval_batch_size
|
||||
drop_remainder = False
|
||||
batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
samples_limit = max_samples[key]
|
||||
dataset = datasets[key]
|
||||
if samples_limit is not None:
|
||||
dataset = dataset.select(range(samples_limit))
|
||||
data = dataset.to_tf_dataset(
|
||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
data = model.prepare_tf_dataset(
|
||||
dataset,
|
||||
shuffle=shuffle,
|
||||
batch_size=batch_size,
|
||||
collate_fn=data_collator,
|
||||
drop_remainder=drop_remainder,
|
||||
# `label_cols` is needed for user-defined losses, such as in this example
|
||||
label_cols="label" if "label" in dataset.column_names else None,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
data = data.with_options(dataset_options)
|
||||
tf_data[key] = data
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and compilation
|
||||
|
||||
if training_args.do_train:
|
||||
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
optimizer, schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
else:
|
||||
optimizer = None
|
||||
if is_regression:
|
||||
metrics = []
|
||||
else:
|
||||
metrics = ["accuracy"]
|
||||
model.compile(optimizer=optimizer, metrics=metrics)
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-text-classification"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training and validation
|
||||
if tf_data["train"] is not None:
|
||||
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
|
||||
model.fit(
|
||||
tf_data["train"],
|
||||
validation_data=tf_data["validation"],
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
callbacks=callbacks,
|
||||
)
|
||||
elif tf_data["validation"] is not None:
|
||||
# If there's a validation dataset but no training set, just evaluate the metrics
|
||||
if tf_data["validation"] is not None:
|
||||
logger.info("Computing metrics on validation data...")
|
||||
if is_regression:
|
||||
loss = model.evaluate(tf_data["validation"])
|
||||
logger.info(f"Loss: {loss:.5f}")
|
||||
logger.info(f"Eval loss: {loss:.5f}")
|
||||
else:
|
||||
loss, accuracy = model.evaluate(tf_data["validation"])
|
||||
logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
|
||||
logger.info(f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%")
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
eval_dict = {"eval_loss": loss}
|
||||
if not is_regression:
|
||||
eval_dict["eval_accuracy"] = accuracy
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_dict))
|
||||
# endregion
|
||||
|
||||
# region Prediction
|
||||
@@ -501,14 +555,9 @@ def main():
|
||||
logger.info(f"Wrote predictions to {output_test_file}!")
|
||||
# endregion
|
||||
|
||||
# region Prediction losses
|
||||
# This section is outside the scope() because it's very quick to compute, but behaves badly inside it
|
||||
if "test" in datasets and "label" in datasets["test"].features:
|
||||
print("Computing prediction loss on test labels...")
|
||||
labels = datasets["test"]["label"]
|
||||
loss = float(loss_fn(labels, predictions).numpy())
|
||||
print(f"Test loss: {loss:.4f}")
|
||||
# endregion
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C
|
||||
without using a Trainer.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from datasets import ClassLabel, load_dataset
|
||||
|
||||
@@ -33,10 +33,11 @@ import evaluate
|
||||
import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_MAPPING,
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorForTokenClassification,
|
||||
HfArgumentParser,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForTokenClassification,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -48,11 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.addHandler(logging.StreamHandler())
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
# You should update this to your particular problem to have better documentation of `model_type`
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt")
|
||||
|
||||
|
||||
# region Command-line arguments
|
||||
@@ -195,61 +192,6 @@ class DataTrainingArguments:
|
||||
# endregion
|
||||
|
||||
|
||||
# region Data generator
|
||||
def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield `(features, labels)` pairs of tensors, one dataset example at a time.

    Args:
        dataset: Indexable collection of examples; `len(dataset)` and `dataset[int]` are used.
        tokenizer: Object whose `pad()` method is applied to each example and whose `pad_token_id` is read.
        shuffle: If truthy, examples are visited in a random permutation, otherwise in index order.
        pad_to_multiple_of: Forwarded unchanged to `tokenizer.pad()`.

    Yields:
        A tuple of the converted feature dict and that dict's `"labels"` entry.
    """
    num_samples = len(dataset)
    visit_order = np.random.permutation(num_samples) if shuffle else np.arange(num_samples)
    for position in visit_order:
        # Pad the single example and get NumPy arrays back so the labels can be edited in place.
        padded = tokenizer.pad(dataset[int(position)], return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
        if tokenizer.pad_token_id is not None:
            # Every position whose attention mask is 0 gets the label -100.
            padded["labels"][padded["attention_mask"] == 0] = -100
        features = {name: tf.convert_to_tensor(values) for name, values in padded.items()}

        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper functions
|
||||
def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap `dataset` in a batched, padded and repeated `tf.data.Dataset`.

    Examples are produced by `sample_generator`, grouped into batches of `total_batch_size` (the last partial
    batch is dropped) and the whole pass is repeated `int(num_epochs)` times.

    Args:
        dataset: Dataset exposing `features`; an `"input_ids"` feature is required.
        tokenizer: Tokenizer handed to `sample_generator`; its `pad_token_id` is used to pad `"input_ids"`.
        total_batch_size: Number of examples per batch.
        num_epochs: Number of repetitions of the dataset (converted with `int()`).
        shuffle: Forwarded to `sample_generator`.

    Returns:
        A `tf.data.Dataset` yielding `(features, labels)` batches.
    """
    train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle)
    feature_spec = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
        for feature in dataset.features
        if feature != "special_tokens_mask"
    }
    feature_spec["labels"] = feature_spec["input_ids"]

    # Derive the padding values from the signature keys (not from `dataset.features`) so that the two
    # structures handed to `padded_batch` can never disagree, e.g. when "special_tokens_mask" was filtered out.
    # This may need to be changed depending on your particular model or tokenizer!
    padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in feature_spec}
    padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64)
    if tokenizer.pad_token_id is not None:
        padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64)

    train_signature = (feature_spec, feature_spec["labels"])
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = (
        tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
        .with_options(options)
        .padded_batch(
            batch_size=total_batch_size,
            drop_remainder=True,
            # Pad the label output with -100, the same value used for the "labels" feature, so padded
            # positions are never mistaken for class 0.
            padding_values=(padding_values, np.array(-100, dtype=np.int64)),
        )
        .repeat(int(num_epochs))
    )
    return tf_dataset
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
def main():
|
||||
# region Argument Parsing
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
||||
@@ -419,6 +361,14 @@ def main():
|
||||
train_dataset = processed_raw_datasets["train"]
|
||||
eval_dataset = processed_raw_datasets["validation"]
|
||||
|
||||
if data_args.max_train_samples is not None:
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
|
||||
if data_args.max_eval_samples is not None:
|
||||
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
# Log a few random samples from the training set:
|
||||
for index in random.sample(range(len(train_dataset)), 3):
|
||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||
@@ -439,43 +389,62 @@ def main():
|
||||
# endregion
|
||||
|
||||
# region Create TF datasets
|
||||
|
||||
# We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
|
||||
# well as inputs.
|
||||
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
train_batches_per_epoch = len(train_dataset) // total_train_batch_size
|
||||
tf_train_dataset = dataset_to_tf(
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
tokenizer,
|
||||
total_batch_size=total_train_batch_size,
|
||||
num_epochs=training_args.num_train_epochs,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=total_train_batch_size,
|
||||
shuffle=True,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
|
||||
tf_eval_dataset = dataset_to_tf(
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset,
|
||||
tokenizer,
|
||||
total_batch_size=total_eval_batch_size,
|
||||
num_epochs=training_args.num_train_epochs,
|
||||
collate_fn=collate_fn,
|
||||
batch_size=total_eval_batch_size,
|
||||
shuffle=False,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and compilation
|
||||
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch),
|
||||
num_warmup_steps=training_args.warmup_steps,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
|
||||
def dummy_loss(y_true, y_pred):
    """Return the mean of `y_pred`; `y_true` is accepted but never read."""
    mean_prediction = tf.reduce_mean(y_pred)
    return mean_prediction
|
||||
|
||||
model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||
# endregion
|
||||
|
||||
# Metrics
|
||||
@@ -517,6 +486,39 @@ def main():
|
||||
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
if data_args.dataset_name is not None:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||
else:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-token-classification"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
if training_args.push_to_hub:
|
||||
callbacks = [
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
]
|
||||
else:
|
||||
callbacks = []
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(f" Num examples = {len(train_dataset)}")
|
||||
@@ -524,23 +526,43 @@ def main():
|
||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||
# Only show the progress bar once on each machine.
|
||||
|
||||
model.fit(
|
||||
tf_train_dataset,
|
||||
validation_data=tf_eval_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=train_batches_per_epoch,
|
||||
validation_steps=eval_batches_per_epoch,
|
||||
callbacks=callbacks,
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Predictions
|
||||
# For predictions, we preload the entire validation set - note that if you have a really giant validation
|
||||
# set, you might need to change this!
|
||||
eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features}
|
||||
predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"]
|
||||
predictions = tf.math.argmax(predictions, axis=-1)
|
||||
labels = np.array(eval_inputs["labels"])
|
||||
labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
|
||||
# If your batches have variable sequence lengths (i.e. not using pad_to_max_length), then
|
||||
# this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
|
||||
# length from predict().
|
||||
|
||||
try:
|
||||
predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"]
|
||||
except tf.python.framework.errors_impl.InvalidArgumentError:
|
||||
raise ValueError(
|
||||
"Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
|
||||
"then you will need to use --pad_to_max_length to generate predictions, as older "
|
||||
"versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
|
||||
)
|
||||
if isinstance(predictions, tf.RaggedTensor):
|
||||
predictions = predictions.to_tensor(default_value=-100)
|
||||
predictions = tf.math.argmax(predictions, axis=-1).numpy()
|
||||
if "label" in eval_dataset:
|
||||
labels = eval_dataset.with_format("tf")["label"]
|
||||
else:
|
||||
labels = eval_dataset.with_format("tf")["labels"]
|
||||
if isinstance(labels, tf.RaggedTensor):
|
||||
labels = labels.to_tensor(default_value=-100)
|
||||
labels = labels.numpy()
|
||||
attention_mask = eval_dataset.with_format("tf")["attention_mask"]
|
||||
if isinstance(attention_mask, tf.RaggedTensor):
|
||||
attention_mask = attention_mask.to_tensor(default_value=-100)
|
||||
attention_mask = attention_mask.numpy()
|
||||
labels[attention_mask == 0] = -100
|
||||
preds, refs = get_labels(predictions, labels)
|
||||
metric.add_batch(
|
||||
predictions=preds,
|
||||
@@ -550,12 +572,15 @@ def main():
|
||||
logger.info("Evaluation metrics:")
|
||||
for key, val in eval_metric.items():
|
||||
logger.info(f"{key}: {val:.4f}")
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metric))
|
||||
# endregion
|
||||
|
||||
# We don't do predictions in the strategy scope because there are some issues in there right now.
|
||||
# They'll get fixed eventually, promise!
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
|
||||
@@ -18,30 +18,32 @@ Fine-tuning the library models for translation.
|
||||
"""
|
||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
import evaluate
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
HfArgumentParser,
|
||||
KerasMetricCallback,
|
||||
M2M100Tokenizer,
|
||||
MBart50Tokenizer,
|
||||
MBart50TokenizerFast,
|
||||
MBartTokenizer,
|
||||
MBartTokenizerFast,
|
||||
PushToHubCallback,
|
||||
TFAutoModelForSeq2SeqLM,
|
||||
TFTrainingArguments,
|
||||
create_optimizer,
|
||||
@@ -224,6 +226,16 @@ class DataTrainingArguments:
|
||||
source_prefix: Optional[str] = field(
|
||||
default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
|
||||
)
|
||||
forced_bos_token: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": (
|
||||
"The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
|
||||
" multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
|
||||
" be the target language token.(Usually it is the target language token)"
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
|
||||
@@ -239,70 +251,6 @@ class DataTrainingArguments:
|
||||
self.val_max_target_length = self.max_target_length
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
# region Data generator
|
||||
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
    """Yield `(features, labels)` pairs of tensors, one dataset example at a time.

    Args:
        dataset: Indexable collection of examples; `len(dataset)` and `dataset[int]` are used.
        model: Optional model. When it provides `prepare_decoder_input_ids_from_labels`, that method is used
            to add a `"decoder_input_ids"` entry built from the example's labels.
        tokenizer: Object whose `pad()` method is applied to each example.
        shuffle: If truthy, examples are visited in a random permutation, otherwise in index order.
        pad_to_multiple_of: Forwarded unchanged to `tokenizer.pad()`.

    Yields:
        A tuple of the converted feature dict and that dict's `"labels"` entry.
    """
    num_samples = len(dataset)
    visit_order = np.random.permutation(num_samples) if shuffle else np.arange(num_samples)
    for position in visit_order:
        # Handle dicts with proper padding and conversion to tensor.
        padded = tokenizer.pad(dataset[int(position)], return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
        features = {name: tf.convert_to_tensor(values, dtype_hint=tf.int32) for name, values in padded.items()}
        if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
            # The helper is called on a batch of one, so add a leading axis and strip it again afterwards.
            batched_labels = tf.expand_dims(features["labels"], 0)
            decoder_input_ids = model.prepare_decoder_input_ids_from_labels(labels=batched_labels)
            features["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0)
        # TF needs some kind of labels, even if we don't use them
        yield features, features["labels"]
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
# region Helper functions
|
||||
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
    """Wrap `dataset` in a batched, padded and repeated `tf.data.Dataset`.

    Examples are produced by `sample_generator`, grouped into batches of `total_batch_size` (the last partial
    batch is dropped) and the whole pass is repeated `int(num_epochs)` times.

    Args:
        dataset: Dataset exposing `features` (an `"input_ids"` feature is required), or `None`.
        model: Optional model. When it provides `prepare_decoder_input_ids_from_labels`, a
            `"decoder_input_ids"` entry is added to the output signature.
        tokenizer: Tokenizer handed to `sample_generator`; its `pad_token_id` is the default padding value.
        total_batch_size: Number of examples per batch.
        num_epochs: Number of repetitions of the dataset (converted with `int()`).
        shuffle: Forwarded to `sample_generator`.

    Returns:
        A `tf.data.Dataset` yielding `(features, labels)` batches, or `None` when `dataset` is `None`.
    """
    if dataset is None:
        return None
    train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
    train_signature = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
        for feature in dataset.features
        if feature != "special_tokens_mask"
    }
    if (
        model is not None
        and "decoder_input_ids" not in train_signature
        and hasattr(model, "prepare_decoder_input_ids_from_labels")
    ):
        train_signature["decoder_input_ids"] = train_signature["labels"]

    # This may need to be changed depending on your particular model or tokenizer!
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    padding_values = {key: tf.convert_to_tensor(pad_token_id, dtype=tf.int32) for key in train_signature}
    for key in train_signature:
        if key.endswith("attention_mask"):
            # Attention masks must be padded with 0, not with the pad token id: a non-zero pad token id
            # would otherwise mark the padded positions as real tokens to attend to.
            padding_values[key] = tf.convert_to_tensor(0, dtype=tf.int32)
    padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)

    train_signature["labels"] = train_signature["input_ids"]
    train_signature = (train_signature, train_signature["labels"])
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = (
        tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
        .with_options(options)
        .padded_batch(
            batch_size=total_batch_size,
            drop_remainder=True,
            padding_values=(padding_values, np.array(-100, dtype=np.int32)),
        )
        .repeat(int(num_epochs))
    )
    return tf_dataset
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
@@ -541,67 +489,149 @@ def main():
|
||||
# endregion
|
||||
|
||||
# region Prepare TF Dataset objects
|
||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||
data_collator = DataCollatorForSeq2Seq(
|
||||
tokenizer,
|
||||
model=model,
|
||||
label_pad_token_id=label_pad_token_id,
|
||||
pad_to_multiple_of=64, # Reduce the number of unique shapes for XLA, especially for generation
|
||||
return_tensors="tf",
|
||||
)
|
||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||
tf_train_dataset = dataset_to_tf(
|
||||
|
||||
dataset_options = tf.data.Options()
|
||||
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||
|
||||
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||
# using model.prepare_tf_dataset()
|
||||
# For more info see the docs:
|
||||
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||
|
||||
tf_train_dataset = model.prepare_tf_dataset(
|
||||
train_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_batch_size=total_train_batch_size,
|
||||
num_epochs=training_args.num_train_epochs,
|
||||
collate_fn=data_collator,
|
||||
batch_size=total_train_batch_size,
|
||||
shuffle=True,
|
||||
)
|
||||
tf_eval_dataset = dataset_to_tf(
|
||||
eval_dataset,
|
||||
model,
|
||||
tokenizer,
|
||||
total_eval_batch_size,
|
||||
num_epochs=1,
|
||||
shuffle=False,
|
||||
)
|
||||
).with_options(dataset_options)
|
||||
tf_eval_dataset = model.prepare_tf_dataset(
|
||||
eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False
|
||||
).with_options(dataset_options)
|
||||
# endregion
|
||||
|
||||
# region Optimizer, loss and LR scheduling
|
||||
# Scheduler and math around the number of training steps.
|
||||
num_update_steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size
|
||||
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=training_args.warmup_steps,
|
||||
)
|
||||
|
||||
def masked_sparse_categorical_crossentropy(y_true, y_pred):
|
||||
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
|
||||
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
|
||||
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
|
||||
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
|
||||
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
|
||||
# More pragmatically, consider redesigning your tokenizer.
|
||||
losses = tf.keras.losses.sparse_categorical_crossentropy(
|
||||
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
|
||||
# region Optimizer and LR scheduling
|
||||
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||
if training_args.warmup_steps > 0:
|
||||
num_warmup_steps = training_args.warmup_steps
|
||||
elif training_args.warmup_ratio > 0:
|
||||
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||
else:
|
||||
num_warmup_steps = 0
|
||||
if training_args.do_train:
|
||||
optimizer, lr_schedule = create_optimizer(
|
||||
init_lr=training_args.learning_rate,
|
||||
num_train_steps=num_train_steps,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
adam_beta1=training_args.adam_beta1,
|
||||
adam_beta2=training_args.adam_beta2,
|
||||
adam_epsilon=training_args.adam_epsilon,
|
||||
weight_decay_rate=training_args.weight_decay,
|
||||
adam_global_clipnorm=training_args.max_grad_norm,
|
||||
)
|
||||
# Compute the per-sample loss only over the unmasked tokens
|
||||
losses = tf.ragged.boolean_mask(losses, y_true != -100)
|
||||
losses = tf.reduce_mean(losses, axis=-1)
|
||||
return losses
|
||||
|
||||
else:
|
||||
optimizer = None
|
||||
# endregion
|
||||
|
||||
# region Metric and postprocessing
|
||||
metric = evaluate.load("sacrebleu")
|
||||
if training_args.do_eval:
|
||||
metric = evaluate.load("sacrebleu")
|
||||
|
||||
def postprocess_text(preds, labels):
|
||||
preds = [pred.strip() for pred in preds]
|
||||
labels = [[label.strip()] for label in labels]
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
|
||||
return preds, labels
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
|
||||
}
|
||||
|
||||
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A tuple `(preds, labels)` where `preds` is a list of stripped strings and `labels` is a list of
        one-element lists, each holding a stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is wrapped in its own single-item list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels
|
||||
|
||||
def compute_metrics(preds):
    """Decode a `(predictions, labels)` pair and score it with the enclosing `metric`.

    Args:
        preds: Two-item sequence `(predictions, labels)`. If `predictions` is a tuple, only its first
            element is used.

    Returns:
        A dict with the single key `"bleu"`, holding the `"score"` entry of the metric result.
    """
    generated, references = preds
    if isinstance(generated, tuple):
        generated = generated[0]
    pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
    # Label positions equal to -100 are swapped for the pad token id before decoding.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    ref_texts = tokenizer.batch_decode(references, skip_special_tokens=True)
    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)
    scores = metric.compute(predictions=pred_texts, references=ref_texts)
    bleu = scores["score"]
    return {"bleu": bleu}
|
||||
|
||||
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
|
||||
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
|
||||
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
|
||||
# For more information, see the docs at
|
||||
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
|
||||
|
||||
metric_callback = KerasMetricCallback(
|
||||
metric_fn=compute_metrics,
|
||||
eval_dataset=tf_eval_dataset,
|
||||
predict_with_generate=True,
|
||||
use_xla_generation=True,
|
||||
generate_kwargs=gen_kwargs,
|
||||
)
|
||||
callbacks = [metric_callback]
|
||||
else:
|
||||
callbacks = []
|
||||
|
||||
# endregion
|
||||
|
||||
# region Preparing push_to_hub and model card
|
||||
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||
if not push_to_hub_model_id:
|
||||
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}"
|
||||
|
||||
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
|
||||
if data_args.dataset_name is not None:
|
||||
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None]
|
||||
if len(languages) > 0:
|
||||
model_card_kwargs["language"] = languages
|
||||
|
||||
if training_args.push_to_hub:
|
||||
# Because this training can be quite long, we save once per epoch.
|
||||
callbacks.append(
|
||||
PushToHubCallback(
|
||||
output_dir=training_args.output_dir,
|
||||
model_id=push_to_hub_model_id,
|
||||
organization=training_args.push_to_hub_organization,
|
||||
token=training_args.push_to_hub_token,
|
||||
tokenizer=tokenizer,
|
||||
**model_card_kwargs,
|
||||
)
|
||||
)
|
||||
# endregion
|
||||
|
||||
# region Training
|
||||
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
|
||||
eval_metrics = None
|
||||
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||
|
||||
if training_args.do_train:
|
||||
logger.info("***** Running training *****")
|
||||
@@ -611,41 +641,48 @@ def main():
|
||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||
logger.info(f" Total optimization steps = {num_train_steps}")
|
||||
|
||||
model.fit(
|
||||
tf_train_dataset,
|
||||
epochs=int(training_args.num_train_epochs),
|
||||
steps_per_epoch=num_update_steps_per_epoch,
|
||||
)
|
||||
if training_args.xla and not data_args.pad_to_max_length:
|
||||
logger.warning(
|
||||
"XLA training may be slow at first when --pad_to_max_length is not set "
|
||||
"until all possible shapes have been compiled."
|
||||
)
|
||||
|
||||
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||
# endregion
|
||||
|
||||
# region Validation
|
||||
if data_args.val_max_target_length is None:
|
||||
data_args.val_max_target_length = data_args.max_target_length
|
||||
if training_args.do_eval and not training_args.do_train:
|
||||
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
|
||||
@tf.function(jit_compile=True)
def generate(**kwargs):
    """Forward all keyword arguments to `model.generate` from inside a `tf.function` with `jit_compile=True`."""
    generated = model.generate(**kwargs)
    return generated
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": data_args.val_max_target_length,
|
||||
"num_beams": data_args.num_beams,
|
||||
}
|
||||
if training_args.do_eval:
|
||||
logger.info("Evaluation...")
|
||||
for batch, labels in tqdm(
|
||||
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
|
||||
):
|
||||
batch.update(gen_kwargs)
|
||||
generated_tokens = model.generate(**batch)
|
||||
if isinstance(generated_tokens, tuple):
|
||||
generated_tokens = generated_tokens[0]
|
||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||
if training_args.do_eval:
|
||||
logger.info("Evaluation...")
|
||||
for batch, labels in tf_eval_dataset:
|
||||
batch.update(gen_kwargs)
|
||||
generated_tokens = generate(**batch)
|
||||
if isinstance(generated_tokens, tuple):
|
||||
generated_tokens = generated_tokens[0]
|
||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||
|
||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||
eval_metric = metric.compute()
|
||||
logger.info({"bleu": eval_metric["score"]})
|
||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||
|
||||
eval_metrics = metric.compute()
|
||||
logger.info({"bleu": eval_metrics["score"]})
|
||||
# endregion
|
||||
|
||||
if training_args.output_dir is not None:
|
||||
if training_args.output_dir is not None and eval_metrics is not None:
|
||||
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
writer.write(json.dumps(eval_metrics))
|
||||
|
||||
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||
# If we're not pushing to hub, at least save a local copy when we're done
|
||||
model.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
|
||||
@@ -87,6 +87,8 @@ def create_optimizer(
|
||||
adam_beta1: float = 0.9,
|
||||
adam_beta2: float = 0.999,
|
||||
adam_epsilon: float = 1e-8,
|
||||
adam_clipnorm: Optional[float] = None,
|
||||
adam_global_clipnorm: Optional[float] = None,
|
||||
weight_decay_rate: float = 0.0,
|
||||
power: float = 1.0,
|
||||
include_in_weight_decay: Optional[List[str]] = None,
|
||||
@@ -109,6 +111,11 @@ def create_optimizer(
|
||||
The beta2 to use in Adam.
|
||||
adam_epsilon (`float`, *optional*, defaults to 1e-8):
|
||||
The epsilon to use in Adam.
|
||||
adam_clipnorm: (`float`, *optional*, defaults to `None`):
|
||||
If not `None`, clip the gradient norm for each weight tensor to this value.
|
||||
adam_global_clipnorm: (`float`, *optional*, defaults to `None`)
|
||||
If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
|
||||
weight tensors, as if they were concatenated into a single vector.
|
||||
weight_decay_rate (`float`, *optional*, defaults to 0):
|
||||
The weight decay to use.
|
||||
power (`float`, *optional*, defaults to 1.0):
|
||||
@@ -137,12 +144,19 @@ def create_optimizer(
|
||||
beta_1=adam_beta1,
|
||||
beta_2=adam_beta2,
|
||||
epsilon=adam_epsilon,
|
||||
clipnorm=adam_clipnorm,
|
||||
global_clipnorm=adam_global_clipnorm,
|
||||
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
|
||||
include_in_weight_decay=include_in_weight_decay,
|
||||
)
|
||||
else:
|
||||
optimizer = tf.keras.optimizers.Adam(
|
||||
learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon
|
||||
learning_rate=lr_schedule,
|
||||
beta_1=adam_beta1,
|
||||
beta_2=adam_beta2,
|
||||
epsilon=adam_epsilon,
|
||||
clipnorm=adam_clipnorm,
|
||||
global_clipnorm=adam_global_clipnorm,
|
||||
)
|
||||
# We return the optimizer and the LR scheduler in order to better track the
|
||||
# evolution of the LR independently of the optimizer.
|
||||
|
||||
@@ -106,6 +106,7 @@ class OptimizerNames(ExplicitEnum):
|
||||
|
||||
@dataclass
|
||||
class TrainingArguments:
|
||||
framework = "pt"
|
||||
"""
|
||||
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
||||
itself**.
|
||||
@@ -1039,25 +1040,25 @@ class TrainingArguments:
|
||||
self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
|
||||
if self.run_name is None:
|
||||
self.run_name = self.output_dir
|
||||
|
||||
if self.fp16_backend and self.fp16_backend != "auto":
|
||||
warnings.warn(
|
||||
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
|
||||
" `half_precision_backend` instead",
|
||||
FutureWarning,
|
||||
)
|
||||
self.half_precision_backend = self.fp16_backend
|
||||
|
||||
if self.bf16 or self.bf16_full_eval:
|
||||
|
||||
if self.no_cuda and not is_torch_bf16_cpu_available():
|
||||
# cpu
|
||||
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
|
||||
elif not self.no_cuda and not is_torch_bf16_gpu_available():
|
||||
# gpu
|
||||
raise ValueError(
|
||||
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
|
||||
if self.framework == "pt" and is_torch_available():
|
||||
if self.fp16_backend and self.fp16_backend != "auto":
|
||||
warnings.warn(
|
||||
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
|
||||
" `half_precision_backend` instead",
|
||||
FutureWarning,
|
||||
)
|
||||
self.half_precision_backend = self.fp16_backend
|
||||
|
||||
if self.bf16 or self.bf16_full_eval:
|
||||
|
||||
if self.no_cuda and not is_torch_bf16_cpu_available():
|
||||
# cpu
|
||||
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
|
||||
elif not self.no_cuda and not is_torch_bf16_gpu_available():
|
||||
# gpu
|
||||
raise ValueError(
|
||||
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
|
||||
)
|
||||
|
||||
if self.fp16 and self.bf16:
|
||||
raise ValueError("At most one of fp16 and bf16 can be True, but not both")
|
||||
@@ -1084,7 +1085,8 @@ class TrainingArguments:
|
||||
self.optim = OptimizerNames.ADAFACTOR
|
||||
|
||||
if (
|
||||
is_torch_available()
|
||||
self.framework == "pt"
|
||||
and is_torch_available()
|
||||
and (self.device.type != "cuda")
|
||||
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
||||
and (self.fp16 or self.fp16_full_eval)
|
||||
@@ -1095,7 +1097,8 @@ class TrainingArguments:
|
||||
)
|
||||
|
||||
if (
|
||||
is_torch_available()
|
||||
self.framework == "pt"
|
||||
and is_torch_available()
|
||||
and (self.device.type != "cuda")
|
||||
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
||||
and (self.device.type != "cpu")
|
||||
@@ -1106,7 +1109,7 @@ class TrainingArguments:
|
||||
" (`--bf16_full_eval`) can only be used on CUDA or CPU devices."
|
||||
)
|
||||
|
||||
if is_torch_available() and self.tf32 is not None:
|
||||
if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
|
||||
if self.tf32:
|
||||
if is_torch_tf32_available():
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
@@ -28,6 +28,7 @@ if is_tf_available():
|
||||
|
||||
@dataclass
|
||||
class TFTrainingArguments(TrainingArguments):
|
||||
framework = "tf"
|
||||
"""
|
||||
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
||||
itself**.
|
||||
@@ -188,9 +189,6 @@ class TFTrainingArguments(TrainingArguments):
|
||||
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]:
|
||||
logger.info("Tensorflow: setting up strategy")
|
||||
|
||||
if self.xla:
|
||||
tf.config.optimizer.set_jit(True)
|
||||
|
||||
gpus = tf.config.list_physical_devices("GPU")
|
||||
|
||||
# Set to float16 at first
|
||||
|
||||
Reference in New Issue
Block a user