TF Examples Rewrite (#18451)
* Finished QA example
* Dodge a merge conflict
* Update text classification and LM examples
* Update NER example
* New Keras metrics WIP, fix NER example
* Update NER example
* Update MC, summarization and translation examples
* Add XLA warnings when shapes are variable
* Make sure batch_size is consistently scaled by num_replicas
* Add PushToHubCallback to all models
* Add docs links for KerasMetricCallback
* Add docs links for prepare_tf_dataset and jit_compile
* Correct inferred model names
* Don't assume the dataset has 'lang'
* Don't assume the dataset has 'lang'
* Write metrics in text classification
* Add 'framework' to TrainingArguments and TFTrainingArguments
* Export metrics in all examples and add tests
* Fix training args for Flax
* Update command line args for translation test
* make fixup
* Fix accidentally running other tests in fp16
* Remove do_train/do_eval from run_clm.py
* Remove do_train/do_eval from run_mlm.py
* Add tensorflow tests to circleci
* Fix circleci
* Update examples/tensorflow/language-modeling/run_mlm.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/test_tensorflow_examples.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/translation/run_translation.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update examples/tensorflow/token-classification/run_ner.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Fix save path for tests
* Fix some model card kwargs
* Explain the magical -1000
* Actually enable tests this time
* Skip text classification PR until we fix shape inference
* make fixup

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
@@ -658,6 +658,71 @@ jobs:
|
|||||||
- store_artifacts:
|
- store_artifacts:
|
||||||
path: ~/transformers/reports
|
path: ~/transformers/reports
|
||||||
|
|
||||||
|
run_examples_tensorflow:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: cimg/python:3.7.12
|
||||||
|
environment:
|
||||||
|
OMP_NUM_THREADS: 1
|
||||||
|
TRANSFORMERS_IS_CI: yes
|
||||||
|
PYTEST_TIMEOUT: 120
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- restore_cache:
|
||||||
|
keys:
|
||||||
|
- v0.5-tensorflow_examples-{{ checksum "setup.py" }}
|
||||||
|
- v0.5-{{ checksum "setup.py" }}
|
||||||
|
- run: pip install --upgrade pip
|
||||||
|
- run: pip install .[sklearn,tensorflow,sentencepiece,testing]
|
||||||
|
- run: pip install -r examples/tensorflow/_tests_requirements.txt
|
||||||
|
- save_cache:
|
||||||
|
key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
|
||||||
|
paths:
|
||||||
|
- '~/.cache/pip'
|
||||||
|
- run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
|
||||||
|
- store_artifacts:
|
||||||
|
path: ~/transformers/test_preparation.txt
|
||||||
|
- run: |
|
||||||
|
if [ -f test_list.txt ]; then
|
||||||
|
python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tests_output.txt
|
||||||
|
fi
|
||||||
|
- store_artifacts:
|
||||||
|
path: ~/transformers/tensorflow_examples_output.txt
|
||||||
|
- store_artifacts:
|
||||||
|
path: ~/transformers/reports
|
||||||
|
|
||||||
|
run_examples_tensorflow_all:
|
||||||
|
working_directory: ~/transformers
|
||||||
|
docker:
|
||||||
|
- image: cimg/python:3.7.12
|
||||||
|
environment:
|
||||||
|
OMP_NUM_THREADS: 1
|
||||||
|
TRANSFORMERS_IS_CI: yes
|
||||||
|
PYTEST_TIMEOUT: 120
|
||||||
|
resource_class: xlarge
|
||||||
|
parallelism: 1
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- restore_cache:
|
||||||
|
keys:
|
||||||
|
- v0.5-tensorflow_examples-{{ checksum "setup.py" }}
|
||||||
|
- v0.5-{{ checksum "setup.py" }}
|
||||||
|
- run: pip install --upgrade pip
|
||||||
|
- run: pip install .[sklearn,tensorflow,sentencepiece,testing]
|
||||||
|
- run: pip install -r examples/tensorflow/_tests_requirements.txt
|
||||||
|
- save_cache:
|
||||||
|
key: v0.5-tensorflow_examples-{{ checksum "setup.py" }}
|
||||||
|
paths:
|
||||||
|
- '~/.cache/pip'
|
||||||
|
- run: |
|
||||||
|
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee examples_output.txt
|
||||||
|
- store_artifacts:
|
||||||
|
path: ~/transformers/tensorflow_examples_output.txt
|
||||||
|
- store_artifacts:
|
||||||
|
path: ~/transformers/reports
|
||||||
|
|
||||||
run_examples_flax:
|
run_examples_flax:
|
||||||
working_directory: ~/transformers
|
working_directory: ~/transformers
|
||||||
docker:
|
docker:
|
||||||
@@ -1000,6 +1065,7 @@ workflows:
|
|||||||
- check_code_quality
|
- check_code_quality
|
||||||
- check_repository_consistency
|
- check_repository_consistency
|
||||||
- run_examples_torch
|
- run_examples_torch
|
||||||
|
- run_examples_tensorflow
|
||||||
- run_examples_flax
|
- run_examples_flax
|
||||||
- run_tests_custom_tokenizers
|
- run_tests_custom_tokenizers
|
||||||
- run_tests_torch_and_tf
|
- run_tests_torch_and_tf
|
||||||
@@ -1022,6 +1088,7 @@ workflows:
|
|||||||
- main
|
- main
|
||||||
jobs:
|
jobs:
|
||||||
- run_examples_torch_all
|
- run_examples_torch_all
|
||||||
|
- run_examples_tensorflow_all
|
||||||
- run_examples_flax_all
|
- run_examples_flax_all
|
||||||
- run_tests_torch_and_tf_all
|
- run_tests_torch_and_tf_all
|
||||||
- run_tests_torch_and_flax_all
|
- run_tests_torch_and_flax_all
|
||||||
|
|||||||
25
examples/tensorflow/_tests_requirements.txt
Normal file
25
examples/tensorflow/_tests_requirements.txt
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
tensorflow
|
||||||
|
tensorboard
|
||||||
|
scikit-learn
|
||||||
|
seqeval
|
||||||
|
psutil
|
||||||
|
sacrebleu >= 1.4.12
|
||||||
|
git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
|
rouge-score
|
||||||
|
tensorflow_datasets
|
||||||
|
matplotlib
|
||||||
|
git-python==1.0.3
|
||||||
|
faiss-cpu
|
||||||
|
streamlit
|
||||||
|
elasticsearch
|
||||||
|
nltk
|
||||||
|
pandas
|
||||||
|
datasets >= 1.13.3
|
||||||
|
fire
|
||||||
|
pytest
|
||||||
|
conllu
|
||||||
|
sentencepiece != 0.1.92
|
||||||
|
protobuf
|
||||||
|
jiwer
|
||||||
|
librosa
|
||||||
|
evaluate >= 0.2.0
|
||||||
@@ -22,6 +22,8 @@ https://huggingface.co/models?filter=text-generation
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own clm task. Pointers for this are left as comments.
|
# You can also adapt this script on your own clm task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
# region Imports
|
# region Imports
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
@@ -46,8 +48,8 @@ from transformers import (
|
|||||||
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
DefaultDataCollator,
|
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForCausalLM,
|
TFAutoModelForCausalLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -205,21 +207,6 @@ class DataTrainingArguments:
|
|||||||
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
|
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Helper classes
|
|
||||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|
||||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
|
||||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
|
||||||
# that saves the model with this method after each epoch.
|
|
||||||
def __init__(self, output_dir, **kwargs):
|
|
||||||
super().__init__()
|
|
||||||
self.output_dir = output_dir
|
|
||||||
|
|
||||||
def on_epoch_end(self, epoch, logs=None):
|
|
||||||
self.model.save_pretrained(self.output_dir)
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
@@ -299,6 +286,7 @@ def main():
|
|||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
data_args.dataset_name,
|
data_args.dataset_name,
|
||||||
data_args.dataset_config_name,
|
data_args.dataset_config_name,
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
use_auth_token=True if model_args.use_auth_token else None,
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
)
|
)
|
||||||
if "validation" not in raw_datasets.keys():
|
if "validation" not in raw_datasets.keys():
|
||||||
@@ -306,12 +294,14 @@ def main():
|
|||||||
data_args.dataset_name,
|
data_args.dataset_name,
|
||||||
data_args.dataset_config_name,
|
data_args.dataset_config_name,
|
||||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
use_auth_token=True if model_args.use_auth_token else None,
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
)
|
)
|
||||||
raw_datasets["train"] = load_dataset(
|
raw_datasets["train"] = load_dataset(
|
||||||
data_args.dataset_name,
|
data_args.dataset_name,
|
||||||
data_args.dataset_config_name,
|
data_args.dataset_config_name,
|
||||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
use_auth_token=True if model_args.use_auth_token else None,
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -321,16 +311,39 @@ def main():
|
|||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = (
|
||||||
|
data_args.train_file.split(".")[-1]
|
||||||
|
if data_args.train_file is not None
|
||||||
|
else data_args.validation_file.split(".")[-1]
|
||||||
|
)
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
|
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
|
||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
extension,
|
extension,
|
||||||
data_files=data_files,
|
data_files=data_files,
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
use_auth_token=True if model_args.use_auth_token else None,
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
**dataset_args,
|
**dataset_args,
|
||||||
)
|
)
|
||||||
|
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
|
||||||
|
if "validation" not in raw_datasets.keys():
|
||||||
|
raw_datasets["validation"] = load_dataset(
|
||||||
|
extension,
|
||||||
|
data_files=data_files,
|
||||||
|
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
|
**dataset_args,
|
||||||
|
)
|
||||||
|
raw_datasets["train"] = load_dataset(
|
||||||
|
extension,
|
||||||
|
data_files=data_files,
|
||||||
|
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||||
|
cache_dir=model_args.cache_dir,
|
||||||
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
|
**dataset_args,
|
||||||
|
)
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||||
# endregion
|
# endregion
|
||||||
@@ -446,7 +459,7 @@ def main():
|
|||||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
|
||||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
@@ -465,44 +478,88 @@ def main():
|
|||||||
|
|
||||||
# region TF Dataset preparation
|
# region TF Dataset preparation
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
|
||||||
options = tf.data.Options()
|
options = tf.data.Options()
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
|
||||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
# labels are passed as input, as we will use the model's internal loss
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
columns=[col for col in train_dataset.features if col != "special_tokens_mask"],
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
|
train_dataset,
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||||
collate_fn=data_collator,
|
|
||||||
drop_remainder=True,
|
|
||||||
).with_options(options)
|
).with_options(options)
|
||||||
|
|
||||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
# labels are passed as input, as we will use the model's internal loss
|
eval_dataset,
|
||||||
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"],
|
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||||
collate_fn=data_collator,
|
|
||||||
drop_remainder=True,
|
drop_remainder=True,
|
||||||
).with_options(options)
|
).with_options(options)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer and loss
|
# region Optimizer and loss
|
||||||
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
|
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
# Bias and layernorm weights are automatically excluded from the decay
|
# Bias and layernorm weights are automatically excluded from the decay
|
||||||
optimizer, lr_schedule = create_optimizer(
|
optimizer, lr_schedule = create_optimizer(
|
||||||
init_lr=training_args.learning_rate,
|
init_lr=training_args.learning_rate,
|
||||||
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
|
num_train_steps=num_train_steps,
|
||||||
num_warmup_steps=training_args.warmup_steps,
|
num_warmup_steps=num_warmup_steps,
|
||||||
adam_beta1=training_args.adam_beta1,
|
adam_beta1=training_args.adam_beta1,
|
||||||
adam_beta2=training_args.adam_beta2,
|
adam_beta2=training_args.adam_beta2,
|
||||||
adam_epsilon=training_args.adam_epsilon,
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
weight_decay_rate=training_args.weight_decay,
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
)
|
)
|
||||||
|
|
||||||
# no user-specified loss = will use the model internal loss
|
# no user-specified loss = will use the model internal loss
|
||||||
model.compile(optimizer=optimizer)
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||||
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||||
|
else:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-clm"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
@@ -512,33 +569,45 @@ def main():
|
|||||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||||
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
||||||
|
|
||||||
|
# For long training runs, you may wish to use the PushToHubCallback here to save intermediate checkpoints
|
||||||
|
# to the Hugging Face Hub rather than just pushing the finished model.
|
||||||
|
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
|
||||||
|
|
||||||
history = model.fit(
|
history = model.fit(
|
||||||
tf_train_dataset,
|
tf_train_dataset,
|
||||||
validation_data=tf_eval_dataset,
|
validation_data=tf_eval_dataset,
|
||||||
epochs=int(training_args.num_train_epochs),
|
epochs=int(training_args.num_train_epochs),
|
||||||
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
|
callbacks=callbacks,
|
||||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
|
||||||
)
|
)
|
||||||
|
train_loss = history.history["loss"][-1]
|
||||||
try:
|
try:
|
||||||
train_perplexity = math.exp(history.history["loss"][-1])
|
train_perplexity = math.exp(train_loss)
|
||||||
except OverflowError:
|
except OverflowError:
|
||||||
train_perplexity = math.inf
|
train_perplexity = math.inf
|
||||||
|
logger.info(f" Final train loss: {train_loss:.3f}")
|
||||||
|
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||||
|
validation_loss = history.history["val_loss"][-1]
|
||||||
try:
|
try:
|
||||||
validation_perplexity = math.exp(history.history["val_loss"][-1])
|
validation_perplexity = math.exp(validation_loss)
|
||||||
except OverflowError:
|
except OverflowError:
|
||||||
validation_perplexity = math.inf
|
validation_perplexity = math.inf
|
||||||
logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}")
|
logger.info(f" Final validation loss: {validation_loss:.3f}")
|
||||||
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
|
||||||
logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
|
|
||||||
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||||
# endregion
|
|
||||||
|
|
||||||
if training_args.output_dir is not None:
|
if training_args.output_dir is not None:
|
||||||
model.save_pretrained(training_args.output_dir)
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
results_dict = dict()
|
||||||
|
results_dict["train_loss"] = train_loss
|
||||||
|
results_dict["train_perplexity"] = train_perplexity
|
||||||
|
results_dict["eval_loss"] = validation_loss
|
||||||
|
results_dict["eval_perplexity"] = validation_perplexity
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(results_dict))
|
||||||
|
# endregion
|
||||||
|
|
||||||
if training_args.push_to_hub:
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
# You'll probably want to include some of your own metadata here!
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
model.push_to_hub()
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -22,9 +22,7 @@ https://huggingface.co/models?filter=fill-mask
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
|
||||||
|
|
||||||
# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected
|
import json
|
||||||
# TODO Duplicate all changes over to the CLM script
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
@@ -50,6 +48,7 @@ from transformers import (
|
|||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
DataCollatorForLanguageModeling,
|
DataCollatorForLanguageModeling,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForMaskedLM,
|
TFAutoModelForMaskedLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -217,22 +216,6 @@ class DataTrainingArguments:
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
# region Helper classes
|
|
||||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|
||||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
|
||||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
|
||||||
# that saves the model with this method after each epoch.
|
|
||||||
def __init__(self, output_dir, **kwargs):
|
|
||||||
super().__init__()
|
|
||||||
self.output_dir = output_dir
|
|
||||||
|
|
||||||
def on_epoch_end(self, epoch, logs=None):
|
|
||||||
self.model.save_pretrained(self.output_dir)
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# region Argument Parsing
|
# region Argument Parsing
|
||||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
||||||
@@ -492,7 +475,7 @@ def main():
|
|||||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
|
||||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
@@ -517,40 +500,88 @@ def main():
|
|||||||
options = tf.data.Options()
|
options = tf.data.Options()
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
|
||||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
# labels are passed as input, as we will use the model's internal loss
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
|
train_dataset,
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=True,
|
|
||||||
).with_options(options)
|
).with_options(options)
|
||||||
|
|
||||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
|
eval_dataset,
|
||||||
# labels are passed as input, as we will use the model's internal loss
|
# labels are passed as input, as we will use the model's internal loss
|
||||||
columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"],
|
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
batch_size=num_replicas * training_args.per_device_train_batch_size,
|
batch_size=num_replicas * training_args.per_device_eval_batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=True,
|
drop_remainder=True,
|
||||||
).with_options(options)
|
).with_options(options)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer and loss
|
# region Optimizer and loss
|
||||||
batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
|
num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs)
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
# Bias and layernorm weights are automatically excluded from the decay
|
# Bias and layernorm weights are automatically excluded from the decay
|
||||||
optimizer, lr_schedule = create_optimizer(
|
optimizer, lr_schedule = create_optimizer(
|
||||||
init_lr=training_args.learning_rate,
|
init_lr=training_args.learning_rate,
|
||||||
num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
|
num_train_steps=num_train_steps,
|
||||||
num_warmup_steps=training_args.warmup_steps,
|
num_warmup_steps=num_warmup_steps,
|
||||||
adam_beta1=training_args.adam_beta1,
|
adam_beta1=training_args.adam_beta1,
|
||||||
adam_beta2=training_args.adam_beta2,
|
adam_beta2=training_args.adam_beta2,
|
||||||
adam_epsilon=training_args.adam_epsilon,
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
weight_decay_rate=training_args.weight_decay,
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
)
|
)
|
||||||
|
|
||||||
# no user-specified loss = will use the model internal loss
|
# no user-specified loss = will use the model internal loss
|
||||||
model.compile(optimizer=optimizer)
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True)
|
||||||
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||||
|
else:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-mlm"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
@@ -560,33 +591,46 @@ def main():
|
|||||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||||
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
|
||||||
|
|
||||||
|
# For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints
|
||||||
|
# to the Hugging Face Hub rather than just pushing the finished model.
|
||||||
|
# See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback
|
||||||
|
|
||||||
history = model.fit(
|
history = model.fit(
|
||||||
tf_train_dataset,
|
tf_train_dataset,
|
||||||
validation_data=tf_eval_dataset,
|
validation_data=tf_eval_dataset,
|
||||||
epochs=int(training_args.num_train_epochs),
|
epochs=int(training_args.num_train_epochs),
|
||||||
steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
|
callbacks=callbacks,
|
||||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
|
||||||
)
|
)
|
||||||
|
train_loss = history.history["loss"][-1]
|
||||||
try:
|
try:
|
||||||
train_perplexity = math.exp(history.history["loss"][-1])
|
train_perplexity = math.exp(train_loss)
|
||||||
except OverflowError:
|
except OverflowError:
|
||||||
train_perplexity = math.inf
|
train_perplexity = math.inf
|
||||||
try:
|
logger.info(f" Final train loss: {train_loss:.3f}")
|
||||||
validation_perplexity = math.exp(history.history["val_loss"][-1])
|
logger.info(f" Final train perplexity: {train_perplexity:.3f}")
|
||||||
except OverflowError:
|
|
||||||
validation_perplexity = math.inf
|
validation_loss = history.history["val_loss"][-1]
|
||||||
logger.warning(f" Final train loss: {history.history['loss'][-1]:.3f}")
|
try:
|
||||||
logger.warning(f" Final train perplexity: {train_perplexity:.3f}")
|
validation_perplexity = math.exp(validation_loss)
|
||||||
logger.warning(f" Final validation loss: {history.history['val_loss'][-1]:.3f}")
|
except OverflowError:
|
||||||
logger.warning(f" Final validation perplexity: {validation_perplexity:.3f}")
|
validation_perplexity = math.inf
|
||||||
|
logger.info(f" Final validation loss: {validation_loss:.3f}")
|
||||||
|
logger.info(f" Final validation perplexity: {validation_perplexity:.3f}")
|
||||||
|
|
||||||
|
if training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
results_dict = dict()
|
||||||
|
results_dict["train_loss"] = train_loss
|
||||||
|
results_dict["train_perplexity"] = train_perplexity
|
||||||
|
results_dict["eval_loss"] = validation_loss
|
||||||
|
results_dict["eval_perplexity"] = validation_perplexity
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(results_dict))
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
if training_args.output_dir is not None:
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
model.save_pretrained(training_args.output_dir)
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
|
model.save_pretrained(training_args.output_dir)
|
||||||
if training_args.push_to_hub:
|
|
||||||
# You'll probably want to append some of your own metadata here!
|
|
||||||
model.push_to_hub()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ Fine-tuning the library models for multiple choice.
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
|
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -38,6 +39,7 @@ from transformers import (
|
|||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
DefaultDataCollator,
|
DefaultDataCollator,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForMultipleChoice,
|
TFAutoModelForMultipleChoice,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -54,16 +56,6 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
# region Helper classes and functions
|
# region Helper classes and functions
|
||||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|
||||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
|
||||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
|
||||||
# that saves the model with this method after each epoch.
|
|
||||||
def __init__(self, output_dir, **kwargs):
|
|
||||||
super().__init__()
|
|
||||||
self.output_dir = output_dir
|
|
||||||
|
|
||||||
def on_epoch_end(self, epoch, logs=None):
|
|
||||||
self.model.save_pretrained(self.output_dir)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -391,7 +383,6 @@ def main():
|
|||||||
if "train" not in raw_datasets:
|
if "train" not in raw_datasets:
|
||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
|
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
train_dataset = train_dataset.select(range(max_train_samples))
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
@@ -407,8 +398,6 @@ def main():
|
|||||||
if "validation" not in raw_datasets:
|
if "validation" not in raw_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if not training_args.do_train:
|
|
||||||
non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
|
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
@@ -444,79 +433,120 @@ def main():
|
|||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
|
num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs)
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
optimizer, lr_schedule = create_optimizer(
|
optimizer, lr_schedule = create_optimizer(
|
||||||
init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0
|
init_lr=training_args.learning_rate,
|
||||||
|
num_train_steps=num_train_steps,
|
||||||
|
num_warmup_steps=num_warmup_steps,
|
||||||
|
adam_beta1=training_args.adam_beta1,
|
||||||
|
adam_beta2=training_args.adam_beta2,
|
||||||
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
optimizer = "adam" # Just put anything in here, since we're not using it anyway
|
optimizer = None
|
||||||
model.compile(
|
model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla)
|
||||||
optimizer=optimizer,
|
# endregion
|
||||||
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
|
|
||||||
metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
|
# region Preparing push_to_hub and model card
|
||||||
)
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"}
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training
|
# region Training
|
||||||
|
eval_metrics = None
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
dataset_exclude_cols = set(non_label_columns + ["label"])
|
dataset_options = tf.data.Options()
|
||||||
tf_train_dataset = train_dataset.to_tf_dataset(
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols],
|
|
||||||
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
|
train_dataset,
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
batch_size=total_train_batch_size,
|
batch_size=total_train_batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=True,
|
).with_options(dataset_options)
|
||||||
# `label_cols` is needed for user-defined losses, such as in this example
|
|
||||||
label_cols="label" if "label" in train_dataset.column_names else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
validation_data = eval_dataset.to_tf_dataset(
|
validation_data = model.prepare_tf_dataset(
|
||||||
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
|
eval_dataset,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
batch_size=total_eval_batch_size,
|
batch_size=total_eval_batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=True,
|
drop_remainder=True,
|
||||||
# `label_cols` is needed for user-defined losses, such as in this example
|
).with_options(dataset_options)
|
||||||
label_cols="label" if "label" in eval_dataset.column_names else None,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
validation_data = None
|
validation_data = None
|
||||||
model.fit(
|
history = model.fit(
|
||||||
tf_train_dataset,
|
tf_train_dataset,
|
||||||
validation_data=validation_data,
|
validation_data=validation_data,
|
||||||
epochs=int(training_args.num_train_epochs),
|
epochs=int(training_args.num_train_epochs),
|
||||||
callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
|
callbacks=callbacks,
|
||||||
)
|
)
|
||||||
|
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Evaluation
|
# region Evaluation
|
||||||
if training_args.do_eval and not training_args.do_train:
|
if training_args.do_eval and not training_args.do_train:
|
||||||
dataset_exclude_cols = set(non_label_columns + ["label"])
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
# Do a standalone evaluation pass
|
# Do a standalone evaluation pass
|
||||||
tf_eval_dataset = eval_dataset.to_tf_dataset(
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols],
|
eval_dataset,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
batch_size=total_eval_batch_size,
|
batch_size=total_eval_batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=True,
|
drop_remainder=True,
|
||||||
# `label_cols` is needed for user-defined losses, such as in this example
|
).with_options(dataset_options)
|
||||||
label_cols="label" if "label" in eval_dataset.column_names else None,
|
eval_results = model.evaluate(tf_eval_dataset)
|
||||||
)
|
eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]}
|
||||||
model.evaluate(tf_eval_dataset)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
if eval_metrics is not None and training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_metrics))
|
||||||
|
|
||||||
# region Push to hub
|
# region Push to hub
|
||||||
if training_args.push_to_hub:
|
|
||||||
model.push_to_hub(
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
finetuned_from=model_args.model_name_or_path,
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
tasks="multiple-choice",
|
model.save_pretrained(training_args.output_dir)
|
||||||
dataset_tags="swag",
|
|
||||||
dataset_args="regular",
|
|
||||||
dataset="SWAG",
|
|
||||||
language="en",
|
|
||||||
)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ Fine-tuning the library models for question answering.
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
|
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -33,13 +34,13 @@ import transformers
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
DataCollatorWithPadding,
|
|
||||||
DefaultDataCollator,
|
|
||||||
EvalPrediction,
|
EvalPrediction,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
PreTrainedTokenizerFast,
|
PreTrainedTokenizerFast,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForQuestionAnswering,
|
TFAutoModelForQuestionAnswering,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
|
create_optimizer,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
|
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry
|
||||||
@@ -609,7 +610,12 @@ def main():
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
with training_args.strategy.scope():
|
with training_args.strategy.scope():
|
||||||
# region Load model
|
|
||||||
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
|
|
||||||
|
# region Load model and prepare datasets
|
||||||
if checkpoint is None:
|
if checkpoint is None:
|
||||||
model_path = model_args.model_name_or_path
|
model_path = model_args.model_name_or_path
|
||||||
else:
|
else:
|
||||||
@@ -621,71 +627,163 @@ def main():
|
|||||||
revision=model_args.model_revision,
|
revision=model_args.model_revision,
|
||||||
use_auth_token=True if model_args.use_auth_token else None,
|
use_auth_token=True if model_args.use_auth_token else None,
|
||||||
)
|
)
|
||||||
optimizer = tf.keras.optimizers.Adam(
|
if training_args.do_train:
|
||||||
learning_rate=training_args.learning_rate,
|
|
||||||
beta_1=training_args.adam_beta1,
|
training_dataset = model.prepare_tf_dataset(
|
||||||
beta_2=training_args.adam_beta2,
|
processed_datasets["train"],
|
||||||
epsilon=training_args.adam_epsilon,
|
shuffle=True,
|
||||||
clipnorm=training_args.max_grad_norm,
|
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||||
)
|
tokenizer=tokenizer,
|
||||||
|
)
|
||||||
|
|
||||||
|
training_dataset = training_dataset.with_options(dataset_options)
|
||||||
|
|
||||||
|
num_train_steps = len(training_dataset) * training_args.num_train_epochs
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
|
optimizer, schedule = create_optimizer(
|
||||||
|
init_lr=training_args.learning_rate,
|
||||||
|
num_train_steps=len(training_dataset) * training_args.num_train_epochs,
|
||||||
|
num_warmup_steps=num_warmup_steps,
|
||||||
|
adam_beta1=training_args.adam_beta1,
|
||||||
|
adam_beta2=training_args.adam_beta2,
|
||||||
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
|
)
|
||||||
|
|
||||||
|
# no user-specified loss = will use the model internal loss
|
||||||
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||||
|
|
||||||
|
else:
|
||||||
|
model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"])
|
||||||
|
training_dataset = None
|
||||||
|
|
||||||
|
if training_args.do_eval:
|
||||||
|
eval_dataset = model.prepare_tf_dataset(
|
||||||
|
processed_datasets["validation"],
|
||||||
|
shuffle=False,
|
||||||
|
batch_size=training_args.per_device_train_batch_size * num_replicas,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
)
|
||||||
|
eval_dataset = eval_dataset.with_options(dataset_options)
|
||||||
|
else:
|
||||||
|
eval_dataset = None
|
||||||
|
|
||||||
|
if training_args.do_predict:
|
||||||
|
predict_dataset = model.prepare_tf_dataset(
|
||||||
|
processed_datasets["test"],
|
||||||
|
shuffle=False,
|
||||||
|
batch_size=training_args.per_device_eval_batch_size * num_replicas,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
)
|
||||||
|
predict_dataset = predict_dataset.with_options(dataset_options)
|
||||||
|
else:
|
||||||
|
predict_dataset = None
|
||||||
|
|
||||||
# no user-specified loss = will use the model internal loss
|
|
||||||
model.compile(optimizer=optimizer)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training
|
# region Preparing push_to_hub and model card
|
||||||
if padding:
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||||
|
else:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-question-answering"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
callbacks = []
|
||||||
tensor_keys = ["attention_mask", "input_ids"]
|
# endregion
|
||||||
label_keys = ["start_positions", "end_positions"]
|
|
||||||
|
# region Training and Evaluation
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
# Make a tf.data.Dataset for this
|
# Note that the validation and test datasets have been processed in a different way to the
|
||||||
training_dataset = processed_datasets["train"].to_tf_dataset(
|
# training datasets in this example, and so they don't have the same label structure.
|
||||||
# labels are passed as input, as we will use the model's internal loss
|
# As such, we don't pass them directly to Keras, but instead get model predictions to evaluate
|
||||||
columns=tensor_keys + label_keys,
|
# after training.
|
||||||
shuffle=True,
|
model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||||
batch_size=training_args.per_device_train_batch_size,
|
|
||||||
collate_fn=data_collator,
|
|
||||||
drop_remainder=True,
|
|
||||||
)
|
|
||||||
model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Evaluation
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
logger.info("*** Evaluation ***")
|
logger.info("*** Evaluation ***")
|
||||||
eval_inputs = {
|
|
||||||
"input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
|
# In this example, we compute advanced metrics at the end of training, but
|
||||||
"attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
|
# if you'd like to compute metrics every epoch that are too complex to be written as
|
||||||
}
|
# standard Keras metrics, you can use our KerasMetricCallback. See
|
||||||
eval_predictions = model.predict(eval_inputs)
|
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
|
||||||
|
|
||||||
|
eval_predictions = model.predict(eval_dataset)
|
||||||
|
if isinstance(eval_predictions.start_logits, tf.RaggedTensor):
|
||||||
|
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||||
|
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||||
|
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||||
|
# padding positions are correctly masked.
|
||||||
|
eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||||
|
eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||||
|
else:
|
||||||
|
eval_start_logits = eval_predictions.start_logits
|
||||||
|
eval_end_logits = eval_predictions.end_logits
|
||||||
|
|
||||||
post_processed_eval = post_processing_function(
|
post_processed_eval = post_processing_function(
|
||||||
datasets["validation"],
|
datasets["validation"],
|
||||||
processed_datasets["validation"],
|
processed_datasets["validation"],
|
||||||
(eval_predictions.start_logits, eval_predictions.end_logits),
|
(eval_start_logits, eval_end_logits),
|
||||||
)
|
)
|
||||||
metrics = compute_metrics(post_processed_eval)
|
metrics = compute_metrics(post_processed_eval)
|
||||||
logging.info("Evaluation metrics:")
|
logging.info("Evaluation metrics:")
|
||||||
for metric, value in metrics.items():
|
for metric, value in metrics.items():
|
||||||
logging.info(f"{metric}: {value:.3f}")
|
logging.info(f"{metric}: {value:.3f}")
|
||||||
|
if training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(metrics))
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Prediction
|
# region Prediction
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
logger.info("*** Predict ***")
|
logger.info("*** Predict ***")
|
||||||
predict_inputs = {
|
|
||||||
"input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
|
test_predictions = model.predict(predict_dataset)
|
||||||
"attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
|
if isinstance(test_predictions.start_logits, tf.RaggedTensor):
|
||||||
}
|
# If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea!
|
||||||
test_predictions = model.predict(predict_inputs)
|
# The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even
|
||||||
|
# the highest probability in a sample. Instead, we use a large negative value, which ensures that the
|
||||||
|
# padding positions are correctly masked.
|
||||||
|
test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy()
|
||||||
|
test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy()
|
||||||
|
else:
|
||||||
|
test_start_logits = test_predictions.start_logits
|
||||||
|
test_end_logits = test_predictions.end_logits
|
||||||
post_processed_test = post_processing_function(
|
post_processed_test = post_processing_function(
|
||||||
datasets["test"],
|
datasets["test"],
|
||||||
processed_datasets["test"],
|
processed_datasets["test"],
|
||||||
(test_predictions.start_logits, test_predictions.end_logits),
|
(test_start_logits, test_end_logits),
|
||||||
)
|
)
|
||||||
metrics = compute_metrics(post_processed_test)
|
metrics = compute_metrics(post_processed_test)
|
||||||
|
|
||||||
@@ -694,8 +792,9 @@ def main():
|
|||||||
logging.info(f"{metric}: {value:.3f}")
|
logging.info(f"{metric}: {value:.3f}")
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
if training_args.push_to_hub:
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
model.push_to_hub()
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ Fine-tuning the library models for summarization.
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import partial
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
@@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
import evaluate
|
import evaluate
|
||||||
import transformers
|
import transformers
|
||||||
@@ -38,7 +37,10 @@ from filelock import FileLock
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
|
DataCollatorForSeq2Seq,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
KerasMetricCallback,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForSeq2SeqLM,
|
TFAutoModelForSeq2SeqLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -253,7 +255,6 @@ class DataTrainingArguments:
|
|||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
# region Dataset name mappings
|
# region Dataset name mappings
|
||||||
summarization_name_mapping = {
|
summarization_name_mapping = {
|
||||||
"amazon_reviews_multi": ("review_body", "review_title"),
|
"amazon_reviews_multi": ("review_body", "review_title"),
|
||||||
@@ -272,71 +273,6 @@ summarization_name_mapping = {
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
# region Data generator
|
|
||||||
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
|
|
||||||
if shuffle:
|
|
||||||
sample_ordering = np.random.permutation(len(dataset))
|
|
||||||
else:
|
|
||||||
sample_ordering = np.arange(len(dataset))
|
|
||||||
for sample_idx in sample_ordering:
|
|
||||||
example = dataset[int(sample_idx)]
|
|
||||||
# Handle dicts with proper padding and conversion to tensor.
|
|
||||||
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
|
|
||||||
example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()}
|
|
||||||
if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
|
|
||||||
decoder_input_ids = model.prepare_decoder_input_ids_from_labels(
|
|
||||||
labels=tf.expand_dims(example["labels"], 0)
|
|
||||||
)
|
|
||||||
example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0)
|
|
||||||
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
# region Helper functions
|
|
||||||
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
|
|
||||||
if dataset is None:
|
|
||||||
return None
|
|
||||||
train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
|
|
||||||
train_signature = {
|
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
|
|
||||||
for feature in dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
|
||||||
}
|
|
||||||
if (
|
|
||||||
model is not None
|
|
||||||
and "decoder_input_ids" not in train_signature
|
|
||||||
and hasattr(model, "prepare_decoder_input_ids_from_labels")
|
|
||||||
):
|
|
||||||
train_signature["decoder_input_ids"] = train_signature["labels"]
|
|
||||||
# This may need to be changed depending on your particular model or tokenizer!
|
|
||||||
padding_values = {
|
|
||||||
key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32)
|
|
||||||
for key in train_signature.keys()
|
|
||||||
}
|
|
||||||
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)
|
|
||||||
train_signature["labels"] = train_signature["input_ids"]
|
|
||||||
train_signature = (train_signature, train_signature["labels"])
|
|
||||||
options = tf.data.Options()
|
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
|
||||||
tf_dataset = (
|
|
||||||
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
|
|
||||||
.with_options(options)
|
|
||||||
.padded_batch(
|
|
||||||
batch_size=total_batch_size,
|
|
||||||
drop_remainder=True,
|
|
||||||
padding_values=(padding_values, np.array(-100, dtype=np.int32)),
|
|
||||||
)
|
|
||||||
.repeat(int(num_epochs))
|
|
||||||
)
|
|
||||||
return tf_dataset
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# region Argument parsing
|
# region Argument parsing
|
||||||
# See all possible arguments in src/transformers/training_args.py
|
# See all possible arguments in src/transformers/training_args.py
|
||||||
@@ -587,59 +523,148 @@ def main():
|
|||||||
if model.config.decoder_start_token_id is None:
|
if model.config.decoder_start_token_id is None:
|
||||||
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
||||||
|
|
||||||
|
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||||
|
data_collator = DataCollatorForSeq2Seq(
|
||||||
|
tokenizer,
|
||||||
|
model=model,
|
||||||
|
label_pad_token_id=label_pad_token_id,
|
||||||
|
pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation
|
||||||
|
return_tensors="tf",
|
||||||
|
)
|
||||||
|
|
||||||
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
tf_train_dataset = dataset_to_tf(
|
|
||||||
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
model,
|
collate_fn=data_collator,
|
||||||
tokenizer,
|
batch_size=total_train_batch_size,
|
||||||
total_batch_size=total_train_batch_size,
|
|
||||||
num_epochs=training_args.num_train_epochs,
|
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
)
|
).with_options(dataset_options)
|
||||||
tf_eval_dataset = dataset_to_tf(
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
eval_dataset,
|
eval_dataset,
|
||||||
model,
|
collate_fn=data_collator,
|
||||||
tokenizer,
|
batch_size=total_eval_batch_size,
|
||||||
total_eval_batch_size,
|
|
||||||
num_epochs=1,
|
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
)
|
).with_options(dataset_options)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer, loss and LR scheduling
|
# region Optimizer, loss and LR scheduling
|
||||||
# Scheduler and math around the number of training steps.
|
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||||
num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size
|
if training_args.warmup_steps > 0:
|
||||||
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
|
num_warmup_steps = training_args.warmup_steps
|
||||||
optimizer, lr_schedule = create_optimizer(
|
elif training_args.warmup_ratio > 0:
|
||||||
init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
)
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
def masked_sparse_categorical_crossentropy(y_true, y_pred):
|
if training_args.do_train:
|
||||||
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
|
optimizer, lr_schedule = create_optimizer(
|
||||||
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
|
init_lr=training_args.learning_rate,
|
||||||
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
|
num_train_steps=num_train_steps,
|
||||||
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
|
num_warmup_steps=num_warmup_steps,
|
||||||
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
|
adam_beta1=training_args.adam_beta1,
|
||||||
# More pragmatically, consider redesigning your tokenizer.
|
adam_beta2=training_args.adam_beta2,
|
||||||
losses = tf.keras.losses.sparse_categorical_crossentropy(
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
)
|
)
|
||||||
# Compute the per-sample loss only over the unmasked tokens
|
else:
|
||||||
losses = tf.ragged.boolean_mask(losses, y_true != -100)
|
optimizer = None
|
||||||
losses = tf.reduce_mean(losses, axis=-1)
|
|
||||||
return losses
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Metric
|
# region Metric and KerasMetricCallback
|
||||||
metric = evaluate.load("rouge")
|
if training_args.do_eval:
|
||||||
|
metric = evaluate.load("rouge")
|
||||||
|
|
||||||
|
if data_args.val_max_target_length is None:
|
||||||
|
data_args.val_max_target_length = data_args.max_target_length
|
||||||
|
|
||||||
|
gen_kwargs = {
|
||||||
|
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
||||||
|
"num_beams": data_args.num_beams,
|
||||||
|
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
|
||||||
|
}
|
||||||
|
|
||||||
|
def compute_metrics(preds):
|
||||||
|
predictions, labels = preds
|
||||||
|
if isinstance(predictions, tuple):
|
||||||
|
predictions = predictions[0]
|
||||||
|
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
|
||||||
|
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||||
|
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||||
|
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||||
|
metrics = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
|
||||||
|
# Only print the mid f-measures, but there are a lot of other statistics in there too!
|
||||||
|
metrics = {key: round(val.mid.fmeasure * 100, 4) for key, val in metrics.items()}
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
|
||||||
|
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
|
||||||
|
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
|
||||||
|
# For more information, see the docs at
|
||||||
|
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
|
||||||
|
|
||||||
|
metric_callback = KerasMetricCallback(
|
||||||
|
metric_fn=compute_metrics,
|
||||||
|
eval_dataset=tf_eval_dataset,
|
||||||
|
predict_with_generate=True,
|
||||||
|
use_xla_generation=True,
|
||||||
|
generate_kwargs=gen_kwargs,
|
||||||
|
)
|
||||||
|
callbacks = [metric_callback]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||||
|
else:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-summarization"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
# Because this training can be quite long, we save once per epoch.
|
||||||
|
callbacks.append(
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Training
|
# region Training
|
||||||
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||||
|
eval_metrics = None
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
logger.info("***** Running training *****")
|
logger.info("***** Running training *****")
|
||||||
logger.info(f" Num examples = {len(train_dataset)}")
|
logger.info(f" Num examples = {len(train_dataset)}")
|
||||||
@@ -648,28 +673,29 @@ def main():
|
|||||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||||
logger.info(f" Total optimization steps = {num_train_steps}")
|
logger.info(f" Total optimization steps = {num_train_steps}")
|
||||||
|
|
||||||
model.fit(
|
if training_args.xla and not data_args.pad_to_max_length:
|
||||||
tf_train_dataset,
|
logger.warning(
|
||||||
epochs=int(training_args.num_train_epochs),
|
"XLA training may be slow at first when --pad_to_max_length is not set "
|
||||||
steps_per_epoch=num_update_steps_per_epoch,
|
"until all possible shapes have been compiled."
|
||||||
)
|
)
|
||||||
|
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||||
|
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Validation
|
# region Validation
|
||||||
if data_args.val_max_target_length is None:
|
|
||||||
data_args.val_max_target_length = data_args.max_target_length
|
|
||||||
|
|
||||||
gen_kwargs = {
|
if training_args.do_eval and not training_args.do_train:
|
||||||
"max_length": data_args.val_max_target_length if data_args is not None else config.max_length,
|
# Do a standalone evaluation run
|
||||||
"num_beams": data_args.num_beams,
|
|
||||||
}
|
|
||||||
if training_args.do_eval:
|
|
||||||
logger.info("Evaluation...")
|
logger.info("Evaluation...")
|
||||||
for batch, labels in tqdm(
|
|
||||||
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
|
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
|
||||||
):
|
@tf.function(jit_compile=True)
|
||||||
|
def generate(**kwargs):
|
||||||
|
return model.generate(**kwargs)
|
||||||
|
|
||||||
|
for batch, labels in tf_eval_dataset:
|
||||||
batch.update(gen_kwargs)
|
batch.update(gen_kwargs)
|
||||||
generated_tokens = model.generate(**batch)
|
generated_tokens = generate(**batch)
|
||||||
if isinstance(generated_tokens, tuple):
|
if isinstance(generated_tokens, tuple):
|
||||||
generated_tokens = generated_tokens[0]
|
generated_tokens = generated_tokens[0]
|
||||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||||
@@ -679,13 +705,19 @@ def main():
|
|||||||
|
|
||||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||||
|
|
||||||
result = metric.compute(use_stemmer=True)
|
eval_metrics = metric.compute(use_stemmer=True)
|
||||||
result = {k: round(v * 100, 4) for k, v in result.items()}
|
|
||||||
|
|
||||||
|
result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()}
|
||||||
logger.info(result)
|
logger.info(result)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
if training_args.output_dir is not None:
|
if training_args.output_dir is not None and eval_metrics is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_metrics))
|
||||||
|
|
||||||
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
model.save_pretrained(training_args.output_dir)
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
295
examples/tensorflow/test_tensorflow_examples.py
Normal file
295
examples/tensorflow/test_tensorflow_examples.py
Normal file
@@ -0,0 +1,295 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2022 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from unittest import skip
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import tensorflow as tf
|
||||||
|
|
||||||
|
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow
|
||||||
|
|
||||||
|
|
||||||
|
SRC_DIRS = [
|
||||||
|
os.path.join(os.path.dirname(__file__), dirname)
|
||||||
|
for dirname in [
|
||||||
|
"text-generation",
|
||||||
|
"text-classification",
|
||||||
|
"token-classification",
|
||||||
|
"language-modeling",
|
||||||
|
"multiple-choice",
|
||||||
|
"question-answering",
|
||||||
|
"summarization",
|
||||||
|
"translation",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
sys.path.extend(SRC_DIRS)
|
||||||
|
|
||||||
|
|
||||||
|
if SRC_DIRS is not None:
|
||||||
|
import run_clm
|
||||||
|
import run_mlm
|
||||||
|
import run_ner
|
||||||
|
import run_qa as run_squad
|
||||||
|
import run_summarization
|
||||||
|
import run_swag
|
||||||
|
import run_text_classification
|
||||||
|
import run_translation
|
||||||
|
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
|
def get_setup_file():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("-f")
|
||||||
|
args = parser.parse_args()
|
||||||
|
return args.f
|
||||||
|
|
||||||
|
|
||||||
|
def get_results(output_dir):
|
||||||
|
results = {}
|
||||||
|
path = os.path.join(output_dir, "all_results.json")
|
||||||
|
if os.path.exists(path):
|
||||||
|
with open(path, "r") as f:
|
||||||
|
results = json.load(f)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"can't find {path}")
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def is_cuda_available():
|
||||||
|
return bool(tf.config.list_physical_devices("GPU"))
|
||||||
|
|
||||||
|
|
||||||
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
|
|
||||||
|
|
||||||
|
class ExamplesTests(TestCasePlus):
|
||||||
|
@skip("Skipping until shape inference for to_tf_dataset PR is merged.")
|
||||||
|
def test_run_text_classification(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_text_classification.py
|
||||||
|
--model_name_or_path distilbert-base-uncased
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--train_file ./tests/fixtures/tests_samples/MRPC/train.csv
|
||||||
|
--validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=1
|
||||||
|
--learning_rate=1e-4
|
||||||
|
--max_steps=10
|
||||||
|
--warmup_steps=2
|
||||||
|
--seed=42
|
||||||
|
--max_seq_length=128
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
if is_cuda_available():
|
||||||
|
testargs.append("--fp16")
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_text_classification.main()
|
||||||
|
# Reset the mixed precision policy so we don't break other tests
|
||||||
|
tf.keras.mixed_precision.set_global_policy("float32")
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["eval_accuracy"], 0.75)
|
||||||
|
|
||||||
|
def test_run_clm(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_clm.py
|
||||||
|
--model_name_or_path distilgpt2
|
||||||
|
--train_file ./tests/fixtures/sample_text.txt
|
||||||
|
--validation_file ./tests/fixtures/sample_text.txt
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--block_size 128
|
||||||
|
--per_device_train_batch_size 2
|
||||||
|
--per_device_eval_batch_size 1
|
||||||
|
--num_train_epochs 2
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
if len(tf.config.list_physical_devices("GPU")) > 1:
|
||||||
|
# Skipping because there are not enough batches to train the model + would need a drop_last to work.
|
||||||
|
return
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_clm.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertLess(result["eval_perplexity"], 100)
|
||||||
|
|
||||||
|
def test_run_mlm(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_mlm.py
|
||||||
|
--model_name_or_path distilroberta-base
|
||||||
|
--train_file ./tests/fixtures/sample_text.txt
|
||||||
|
--validation_file ./tests/fixtures/sample_text.txt
|
||||||
|
--max_seq_length 64
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--prediction_loss_only
|
||||||
|
--num_train_epochs=1
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_mlm.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertLess(result["eval_perplexity"], 42)
|
||||||
|
|
||||||
|
def test_run_ner(self):
|
||||||
|
# with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
|
||||||
|
epochs = 7 if get_gpu_count() > 1 else 2
|
||||||
|
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_ner.py
|
||||||
|
--model_name_or_path bert-base-uncased
|
||||||
|
--train_file tests/fixtures/tests_samples/conll/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/conll/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--warmup_steps=2
|
||||||
|
--learning_rate=2e-4
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=2
|
||||||
|
--num_train_epochs={epochs}
|
||||||
|
--seed 7
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_ner.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["accuracy"], 0.75)
|
||||||
|
|
||||||
|
def test_run_squad(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_qa.py
|
||||||
|
--model_name_or_path bert-base-uncased
|
||||||
|
--version_2_with_negative
|
||||||
|
--train_file tests/fixtures/tests_samples/SQUAD/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/SQUAD/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--max_steps=10
|
||||||
|
--warmup_steps=2
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--learning_rate=2e-4
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=1
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_squad.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["f1"], 30)
|
||||||
|
self.assertGreaterEqual(result["exact"], 30)
|
||||||
|
|
||||||
|
def test_run_swag(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_swag.py
|
||||||
|
--model_name_or_path bert-base-uncased
|
||||||
|
--train_file tests/fixtures/tests_samples/swag/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/swag/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--max_steps=20
|
||||||
|
--warmup_steps=2
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--learning_rate=2e-4
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=1
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_swag.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["val_accuracy"], 0.8)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_run_summarization(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_summarization.py
|
||||||
|
--model_name_or_path t5-small
|
||||||
|
--train_file tests/fixtures/tests_samples/xsum/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/xsum/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--max_steps=50
|
||||||
|
--warmup_steps=8
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--learning_rate=2e-4
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=1
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_summarization.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["rouge1"], 10)
|
||||||
|
self.assertGreaterEqual(result["rouge2"], 2)
|
||||||
|
self.assertGreaterEqual(result["rougeL"], 7)
|
||||||
|
self.assertGreaterEqual(result["rougeLsum"], 7)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_run_translation(self):
|
||||||
|
tmp_dir = self.get_auto_remove_tmp_dir()
|
||||||
|
testargs = f"""
|
||||||
|
run_translation.py
|
||||||
|
--model_name_or_path Rocketknight1/student_marian_en_ro_6_1
|
||||||
|
--source_lang en
|
||||||
|
--target_lang ro
|
||||||
|
--train_file tests/fixtures/tests_samples/wmt16/sample.json
|
||||||
|
--validation_file tests/fixtures/tests_samples/wmt16/sample.json
|
||||||
|
--output_dir {tmp_dir}
|
||||||
|
--overwrite_output_dir
|
||||||
|
--warmup_steps=8
|
||||||
|
--do_train
|
||||||
|
--do_eval
|
||||||
|
--learning_rate=3e-3
|
||||||
|
--num_train_epochs 12
|
||||||
|
--per_device_train_batch_size=2
|
||||||
|
--per_device_eval_batch_size=1
|
||||||
|
--source_lang en_XX
|
||||||
|
--target_lang ro_RO
|
||||||
|
""".split()
|
||||||
|
|
||||||
|
with patch.object(sys, "argv", testargs):
|
||||||
|
run_translation.main()
|
||||||
|
result = get_results(tmp_dir)
|
||||||
|
self.assertGreaterEqual(result["bleu"], 30)
|
||||||
@@ -16,6 +16,7 @@
|
|||||||
""" Finetuning the library models for sequence classification on GLUE."""
|
""" Finetuning the library models for sequence classification on GLUE."""
|
||||||
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -35,32 +36,16 @@ from transformers import (
|
|||||||
DefaultDataCollator,
|
DefaultDataCollator,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
PretrainedConfig,
|
PretrainedConfig,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForSequenceClassification,
|
TFAutoModelForSequenceClassification,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
|
create_optimizer,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
||||||
from transformers.utils import check_min_version, send_example_telemetry
|
from transformers.utils import check_min_version, send_example_telemetry
|
||||||
|
|
||||||
|
|
||||||
# region Helper functions
|
|
||||||
|
|
||||||
|
|
||||||
class SavePretrainedCallback(tf.keras.callbacks.Callback):
|
|
||||||
# Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
|
|
||||||
# metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
|
|
||||||
# that saves the model with this method after each epoch.
|
|
||||||
def __init__(self, output_dir, **kwargs):
|
|
||||||
super().__init__()
|
|
||||||
self.output_dir = output_dir
|
|
||||||
|
|
||||||
def on_epoch_end(self, epoch, logs=None):
|
|
||||||
self.model.save_pretrained(self.output_dir)
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||||
check_min_version("4.22.0.dev0")
|
check_min_version("4.22.0.dev0")
|
||||||
|
|
||||||
@@ -312,7 +297,6 @@ def main():
|
|||||||
|
|
||||||
# region Dataset preprocessing
|
# region Dataset preprocessing
|
||||||
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
|
sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
|
||||||
non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
|
|
||||||
|
|
||||||
# Padding strategy
|
# Padding strategy
|
||||||
if data_args.pad_to_max_length:
|
if data_args.pad_to_max_length:
|
||||||
@@ -394,24 +378,11 @@ def main():
|
|||||||
)
|
)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer, loss and compilation
|
|
||||||
optimizer = tf.keras.optimizers.Adam(
|
|
||||||
learning_rate=training_args.learning_rate,
|
|
||||||
beta_1=training_args.adam_beta1,
|
|
||||||
beta_2=training_args.adam_beta2,
|
|
||||||
epsilon=training_args.adam_epsilon,
|
|
||||||
clipnorm=training_args.max_grad_norm,
|
|
||||||
)
|
|
||||||
if is_regression:
|
|
||||||
loss_fn = tf.keras.losses.MeanSquaredError()
|
|
||||||
metrics = []
|
|
||||||
else:
|
|
||||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
|
||||||
metrics = ["accuracy"]
|
|
||||||
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Convert data to a tf.data.Dataset
|
# region Convert data to a tf.data.Dataset
|
||||||
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
|
|
||||||
tf_data = dict()
|
tf_data = dict()
|
||||||
max_samples = {
|
max_samples = {
|
||||||
"train": data_args.max_train_samples,
|
"train": data_args.max_train_samples,
|
||||||
@@ -428,31 +399,89 @@ def main():
|
|||||||
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
||||||
if key == "train":
|
if key == "train":
|
||||||
shuffle = True
|
shuffle = True
|
||||||
batch_size = training_args.per_device_train_batch_size
|
batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
|
|
||||||
else:
|
else:
|
||||||
shuffle = False
|
shuffle = False
|
||||||
batch_size = training_args.per_device_eval_batch_size
|
batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
drop_remainder = False
|
|
||||||
samples_limit = max_samples[key]
|
samples_limit = max_samples[key]
|
||||||
dataset = datasets[key]
|
dataset = datasets[key]
|
||||||
if samples_limit is not None:
|
if samples_limit is not None:
|
||||||
dataset = dataset.select(range(samples_limit))
|
dataset = dataset.select(range(samples_limit))
|
||||||
data = dataset.to_tf_dataset(
|
|
||||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
data = model.prepare_tf_dataset(
|
||||||
|
dataset,
|
||||||
shuffle=shuffle,
|
shuffle=shuffle,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
collate_fn=data_collator,
|
collate_fn=data_collator,
|
||||||
drop_remainder=drop_remainder,
|
tokenizer=tokenizer,
|
||||||
# `label_cols` is needed for user-defined losses, such as in this example
|
|
||||||
label_cols="label" if "label" in dataset.column_names else None,
|
|
||||||
)
|
)
|
||||||
|
data = data.with_options(dataset_options)
|
||||||
tf_data[key] = data
|
tf_data[key] = data
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
# region Optimizer, loss and compilation
|
||||||
|
if training_args.do_train:
|
||||||
|
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
|
optimizer, schedule = create_optimizer(
|
||||||
|
init_lr=training_args.learning_rate,
|
||||||
|
num_train_steps=num_train_steps,
|
||||||
|
num_warmup_steps=num_warmup_steps,
|
||||||
|
adam_beta1=training_args.adam_beta1,
|
||||||
|
adam_beta2=training_args.adam_beta2,
|
||||||
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
optimizer = "adam" # Just write anything because we won't be using it
|
||||||
|
if is_regression:
|
||||||
|
metrics = []
|
||||||
|
else:
|
||||||
|
metrics = ["accuracy"]
|
||||||
|
model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla)
|
||||||
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-glue"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
|
||||||
|
model_card_kwargs["task_name"] = data_args.task_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
|
|
||||||
if training_args.do_eval and not data_args.task_name == "mnli":
|
if training_args.do_eval and not data_args.task_name == "mnli":
|
||||||
# Do both evaluation and training in the Keras fit loop, unless the task is MNLI
|
# Do both evaluation and training in the Keras fit loop, unless the task is MNLI
|
||||||
# because MNLI has two validation sets
|
# because MNLI has two validation sets
|
||||||
@@ -472,6 +501,12 @@ def main():
|
|||||||
# We normally do validation as part of the Keras fit loop, but we run it independently
|
# We normally do validation as part of the Keras fit loop, but we run it independently
|
||||||
# if there was no fit() step (because we didn't train the model) or if the task is MNLI,
|
# if there was no fit() step (because we didn't train the model) or if the task is MNLI,
|
||||||
# because MNLI has a separate validation-mismatched validation set
|
# because MNLI has a separate validation-mismatched validation set
|
||||||
|
|
||||||
|
# In this example, we compute advanced metrics only at the end of training, and only compute
|
||||||
|
# loss and accuracy on the validation set each epoch, but
|
||||||
|
# if you'd like to compute metrics every epoch that are too complex to be written as
|
||||||
|
# standard Keras metrics, you can use our KerasMetricCallback. See
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks
|
||||||
logger.info("*** Evaluate ***")
|
logger.info("*** Evaluate ***")
|
||||||
|
|
||||||
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
# Loop to handle MNLI double evaluation (matched, mis-matched)
|
||||||
@@ -489,6 +524,10 @@ def main():
|
|||||||
eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"])
|
eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"])
|
||||||
print(f"Evaluation metrics ({task}):")
|
print(f"Evaluation metrics ({task}):")
|
||||||
print(eval_metrics)
|
print(eval_metrics)
|
||||||
|
if training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_metrics))
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
@@ -538,6 +577,10 @@ def main():
|
|||||||
writer.write(f"{index}\t{item}\n")
|
writer.write(f"{index}\t{item}\n")
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
""" Fine-tuning the library models for sequence classification."""
|
""" Fine-tuning the library models for sequence classification."""
|
||||||
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -29,12 +30,12 @@ from datasets import load_dataset
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
DataCollatorWithPadding,
|
|
||||||
DefaultDataCollator,
|
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
PretrainedConfig,
|
PretrainedConfig,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForSequenceClassification,
|
TFAutoModelForSequenceClassification,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
|
create_optimizer,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry
|
from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry
|
||||||
@@ -383,10 +384,6 @@ def main():
|
|||||||
|
|
||||||
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
|
datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
|
||||||
|
|
||||||
if data_args.pad_to_max_length:
|
|
||||||
data_collator = DefaultDataCollator(return_tensors="tf")
|
|
||||||
else:
|
|
||||||
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
with training_args.strategy.scope():
|
with training_args.strategy.scope():
|
||||||
@@ -409,24 +406,10 @@ def main():
|
|||||||
)
|
)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer, loss and compilation
|
|
||||||
optimizer = tf.keras.optimizers.Adam(
|
|
||||||
learning_rate=training_args.learning_rate,
|
|
||||||
beta_1=training_args.adam_beta1,
|
|
||||||
beta_2=training_args.adam_beta2,
|
|
||||||
epsilon=training_args.adam_epsilon,
|
|
||||||
clipnorm=training_args.max_grad_norm,
|
|
||||||
)
|
|
||||||
if is_regression:
|
|
||||||
loss_fn = tf.keras.losses.MeanSquaredError()
|
|
||||||
metrics = []
|
|
||||||
else:
|
|
||||||
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
|
|
||||||
metrics = ["accuracy"]
|
|
||||||
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Convert data to a tf.data.Dataset
|
# region Convert data to a tf.data.Dataset
|
||||||
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
|
|
||||||
tf_data = dict()
|
tf_data = dict()
|
||||||
max_samples = {
|
max_samples = {
|
||||||
@@ -438,50 +421,121 @@ def main():
|
|||||||
if key not in datasets:
|
if key not in datasets:
|
||||||
tf_data[key] = None
|
tf_data[key] = None
|
||||||
continue
|
continue
|
||||||
|
if (
|
||||||
|
(key == "train" and not training_args.do_train)
|
||||||
|
or (key == "validation" and not training_args.do_eval)
|
||||||
|
or (key == "test" and not training_args.do_predict)
|
||||||
|
):
|
||||||
|
tf_data[key] = None
|
||||||
|
continue
|
||||||
if key in ("train", "validation"):
|
if key in ("train", "validation"):
|
||||||
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
assert "label" in datasets[key].features, f"Missing labels from {key} data!"
|
||||||
if key == "train":
|
if key == "train":
|
||||||
shuffle = True
|
shuffle = True
|
||||||
batch_size = training_args.per_device_train_batch_size
|
batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
drop_remainder = True # Saves us worrying about scaling gradients for the last batch
|
|
||||||
else:
|
else:
|
||||||
shuffle = False
|
shuffle = False
|
||||||
batch_size = training_args.per_device_eval_batch_size
|
batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
drop_remainder = False
|
|
||||||
samples_limit = max_samples[key]
|
samples_limit = max_samples[key]
|
||||||
dataset = datasets[key]
|
dataset = datasets[key]
|
||||||
if samples_limit is not None:
|
if samples_limit is not None:
|
||||||
dataset = dataset.select(range(samples_limit))
|
dataset = dataset.select(range(samples_limit))
|
||||||
data = dataset.to_tf_dataset(
|
|
||||||
columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
data = model.prepare_tf_dataset(
|
||||||
|
dataset,
|
||||||
shuffle=shuffle,
|
shuffle=shuffle,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
collate_fn=data_collator,
|
tokenizer=tokenizer,
|
||||||
drop_remainder=drop_remainder,
|
|
||||||
# `label_cols` is needed for user-defined losses, such as in this example
|
|
||||||
label_cols="label" if "label" in dataset.column_names else None,
|
|
||||||
)
|
)
|
||||||
|
data = data.with_options(dataset_options)
|
||||||
tf_data[key] = data
|
tf_data[key] = data
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
# region Optimizer, loss and compilation
|
||||||
|
|
||||||
|
if training_args.do_train:
|
||||||
|
num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
|
optimizer, schedule = create_optimizer(
|
||||||
|
init_lr=training_args.learning_rate,
|
||||||
|
num_train_steps=num_train_steps,
|
||||||
|
num_warmup_steps=num_warmup_steps,
|
||||||
|
adam_beta1=training_args.adam_beta1,
|
||||||
|
adam_beta2=training_args.adam_beta2,
|
||||||
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
optimizer = None
|
||||||
|
if is_regression:
|
||||||
|
metrics = []
|
||||||
|
else:
|
||||||
|
metrics = ["accuracy"]
|
||||||
|
model.compile(optimizer=optimizer, metrics=metrics)
|
||||||
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-text-classification"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
|
# endregion
|
||||||
|
|
||||||
# region Training and validation
|
# region Training and validation
|
||||||
if tf_data["train"] is not None:
|
if tf_data["train"] is not None:
|
||||||
callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
|
|
||||||
model.fit(
|
model.fit(
|
||||||
tf_data["train"],
|
tf_data["train"],
|
||||||
validation_data=tf_data["validation"],
|
validation_data=tf_data["validation"],
|
||||||
epochs=int(training_args.num_train_epochs),
|
epochs=int(training_args.num_train_epochs),
|
||||||
callbacks=callbacks,
|
callbacks=callbacks,
|
||||||
)
|
)
|
||||||
elif tf_data["validation"] is not None:
|
if tf_data["validation"] is not None:
|
||||||
# If there's a validation dataset but no training set, just evaluate the metrics
|
|
||||||
logger.info("Computing metrics on validation data...")
|
logger.info("Computing metrics on validation data...")
|
||||||
if is_regression:
|
if is_regression:
|
||||||
loss = model.evaluate(tf_data["validation"])
|
loss = model.evaluate(tf_data["validation"])
|
||||||
logger.info(f"Loss: {loss:.5f}")
|
logger.info(f"Eval loss: {loss:.5f}")
|
||||||
else:
|
else:
|
||||||
loss, accuracy = model.evaluate(tf_data["validation"])
|
loss, accuracy = model.evaluate(tf_data["validation"])
|
||||||
logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
|
logger.info(f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%")
|
||||||
|
if training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
eval_dict = {"eval_loss": loss}
|
||||||
|
if not is_regression:
|
||||||
|
eval_dict["eval_accuracy"] = accuracy
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_dict))
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Prediction
|
# region Prediction
|
||||||
@@ -501,14 +555,9 @@ def main():
|
|||||||
logger.info(f"Wrote predictions to {output_test_file}!")
|
logger.info(f"Wrote predictions to {output_test_file}!")
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Prediction losses
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
# This section is outside the scope() because it's very quick to compute, but behaves badly inside it
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
if "test" in datasets and "label" in datasets["test"].features:
|
model.save_pretrained(training_args.output_dir)
|
||||||
print("Computing prediction loss on test labels...")
|
|
||||||
labels = datasets["test"]["label"]
|
|
||||||
loss = float(loss_fn(labels, predictions).numpy())
|
|
||||||
print(f"Test loss: {loss:.4f}")
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C
|
|||||||
without using a Trainer.
|
without using a Trainer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import random
|
import random
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import partial
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from datasets import ClassLabel, load_dataset
|
from datasets import ClassLabel, load_dataset
|
||||||
|
|
||||||
@@ -33,10 +33,11 @@ import evaluate
|
|||||||
import transformers
|
import transformers
|
||||||
from transformers import (
|
from transformers import (
|
||||||
CONFIG_MAPPING,
|
CONFIG_MAPPING,
|
||||||
MODEL_MAPPING,
|
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
|
DataCollatorForTokenClassification,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForTokenClassification,
|
TFAutoModelForTokenClassification,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -48,11 +49,7 @@ from transformers.utils.versions import require_version
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.addHandler(logging.StreamHandler())
|
logger.addHandler(logging.StreamHandler())
|
||||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt")
|
||||||
|
|
||||||
# You should update this to your particular problem to have better documentation of `model_type`
|
|
||||||
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
|
|
||||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
|
||||||
|
|
||||||
|
|
||||||
# region Command-line arguments
|
# region Command-line arguments
|
||||||
@@ -195,61 +192,6 @@ class DataTrainingArguments:
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
# region Data generator
|
|
||||||
def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None):
|
|
||||||
# Trim off the last partial batch if present
|
|
||||||
if shuffle:
|
|
||||||
sample_ordering = np.random.permutation(len(dataset))
|
|
||||||
else:
|
|
||||||
sample_ordering = np.arange(len(dataset))
|
|
||||||
for sample_idx in sample_ordering:
|
|
||||||
example = dataset[int(sample_idx)]
|
|
||||||
# Handle dicts with proper padding and conversion to tensor.
|
|
||||||
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
|
|
||||||
if tokenizer.pad_token_id is not None:
|
|
||||||
example["labels"][example["attention_mask"] == 0] = -100
|
|
||||||
example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}
|
|
||||||
|
|
||||||
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
# region Helper functions
|
|
||||||
def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle):
|
|
||||||
train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle)
|
|
||||||
train_signature = {
|
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
|
|
||||||
for feature in dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
|
||||||
}
|
|
||||||
# This may need to be changed depending on your particular model or tokenizer!
|
|
||||||
padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in dataset.features}
|
|
||||||
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64)
|
|
||||||
if tokenizer.pad_token_id is not None:
|
|
||||||
padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64)
|
|
||||||
train_signature["labels"] = train_signature["input_ids"]
|
|
||||||
train_signature = (train_signature, train_signature["labels"])
|
|
||||||
options = tf.data.Options()
|
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
|
||||||
tf_dataset = (
|
|
||||||
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
|
|
||||||
.with_options(options)
|
|
||||||
.padded_batch(
|
|
||||||
batch_size=total_batch_size,
|
|
||||||
drop_remainder=True,
|
|
||||||
padding_values=(padding_values, np.array(0, dtype=np.int64)),
|
|
||||||
)
|
|
||||||
.repeat(int(num_epochs))
|
|
||||||
)
|
|
||||||
return tf_dataset
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# region Argument Parsing
|
# region Argument Parsing
|
||||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
|
||||||
@@ -419,6 +361,14 @@ def main():
|
|||||||
train_dataset = processed_raw_datasets["train"]
|
train_dataset = processed_raw_datasets["train"]
|
||||||
eval_dataset = processed_raw_datasets["validation"]
|
eval_dataset = processed_raw_datasets["validation"]
|
||||||
|
|
||||||
|
if data_args.max_train_samples is not None:
|
||||||
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
|
if data_args.max_eval_samples is not None:
|
||||||
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||||
@@ -439,43 +389,62 @@ def main():
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Create TF datasets
|
# region Create TF datasets
|
||||||
|
|
||||||
|
# We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
|
||||||
|
# well as inputs.
|
||||||
|
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
train_batches_per_epoch = len(train_dataset) // total_train_batch_size
|
|
||||||
tf_train_dataset = dataset_to_tf(
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
|
||||||
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
tokenizer,
|
collate_fn=collate_fn,
|
||||||
total_batch_size=total_train_batch_size,
|
batch_size=total_train_batch_size,
|
||||||
num_epochs=training_args.num_train_epochs,
|
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
)
|
).with_options(dataset_options)
|
||||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
tf_eval_dataset = dataset_to_tf(
|
|
||||||
eval_dataset,
|
eval_dataset,
|
||||||
tokenizer,
|
collate_fn=collate_fn,
|
||||||
total_batch_size=total_eval_batch_size,
|
batch_size=total_eval_batch_size,
|
||||||
num_epochs=training_args.num_train_epochs,
|
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
)
|
).with_options(dataset_options)
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer, loss and compilation
|
# region Optimizer, loss and compilation
|
||||||
|
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||||
|
if training_args.warmup_steps > 0:
|
||||||
|
num_warmup_steps = training_args.warmup_steps
|
||||||
|
elif training_args.warmup_ratio > 0:
|
||||||
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
|
else:
|
||||||
|
num_warmup_steps = 0
|
||||||
|
|
||||||
optimizer, lr_schedule = create_optimizer(
|
optimizer, lr_schedule = create_optimizer(
|
||||||
init_lr=training_args.learning_rate,
|
init_lr=training_args.learning_rate,
|
||||||
num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch),
|
num_train_steps=num_train_steps,
|
||||||
num_warmup_steps=training_args.warmup_steps,
|
num_warmup_steps=num_warmup_steps,
|
||||||
adam_beta1=training_args.adam_beta1,
|
adam_beta1=training_args.adam_beta1,
|
||||||
adam_beta2=training_args.adam_beta2,
|
adam_beta2=training_args.adam_beta2,
|
||||||
adam_epsilon=training_args.adam_epsilon,
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
weight_decay_rate=training_args.weight_decay,
|
weight_decay_rate=training_args.weight_decay,
|
||||||
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
)
|
)
|
||||||
|
|
||||||
def dummy_loss(y_true, y_pred):
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||||
return tf.reduce_mean(y_pred)
|
|
||||||
|
|
||||||
model.compile(loss={"loss": dummy_loss}, optimizer=optimizer)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# Metrics
|
# Metrics
|
||||||
@@ -517,6 +486,39 @@ def main():
|
|||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
|
||||||
|
else:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-token-classification"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
callbacks = [
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
|
# endregion
|
||||||
|
|
||||||
# region Training
|
# region Training
|
||||||
logger.info("***** Running training *****")
|
logger.info("***** Running training *****")
|
||||||
logger.info(f" Num examples = {len(train_dataset)}")
|
logger.info(f" Num examples = {len(train_dataset)}")
|
||||||
@@ -524,23 +526,43 @@ def main():
|
|||||||
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
|
||||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||||
# Only show the progress bar once on each machine.
|
# Only show the progress bar once on each machine.
|
||||||
|
|
||||||
model.fit(
|
model.fit(
|
||||||
tf_train_dataset,
|
tf_train_dataset,
|
||||||
validation_data=tf_eval_dataset,
|
validation_data=tf_eval_dataset,
|
||||||
epochs=int(training_args.num_train_epochs),
|
epochs=int(training_args.num_train_epochs),
|
||||||
steps_per_epoch=train_batches_per_epoch,
|
callbacks=callbacks,
|
||||||
validation_steps=eval_batches_per_epoch,
|
|
||||||
)
|
)
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Predictions
|
# region Predictions
|
||||||
# For predictions, we preload the entire validation set - note that if you have a really giant validation
|
# If you have variable batch sizes (i.e. not using pad_to_max_length), then
|
||||||
# set, you might need to change this!
|
# this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
|
||||||
eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features}
|
# length from predict().
|
||||||
predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"]
|
|
||||||
predictions = tf.math.argmax(predictions, axis=-1)
|
try:
|
||||||
labels = np.array(eval_inputs["labels"])
|
predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"]
|
||||||
labels[np.array(eval_inputs["attention_mask"]) == 0] = -100
|
except tf.python.framework.errors_impl.InvalidArgumentError:
|
||||||
|
raise ValueError(
|
||||||
|
"Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
|
||||||
|
"then you will need to use --pad_to_max_length to generate predictions, as older "
|
||||||
|
"versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
|
||||||
|
)
|
||||||
|
if isinstance(predictions, tf.RaggedTensor):
|
||||||
|
predictions = predictions.to_tensor(default_value=-100)
|
||||||
|
predictions = tf.math.argmax(predictions, axis=-1).numpy()
|
||||||
|
if "label" in eval_dataset:
|
||||||
|
labels = eval_dataset.with_format("tf")["label"]
|
||||||
|
else:
|
||||||
|
labels = eval_dataset.with_format("tf")["labels"]
|
||||||
|
if isinstance(labels, tf.RaggedTensor):
|
||||||
|
labels = labels.to_tensor(default_value=-100)
|
||||||
|
labels = labels.numpy()
|
||||||
|
attention_mask = eval_dataset.with_format("tf")["attention_mask"]
|
||||||
|
if isinstance(attention_mask, tf.RaggedTensor):
|
||||||
|
attention_mask = attention_mask.to_tensor(default_value=-100)
|
||||||
|
attention_mask = attention_mask.numpy()
|
||||||
|
labels[attention_mask == 0] = -100
|
||||||
preds, refs = get_labels(predictions, labels)
|
preds, refs = get_labels(predictions, labels)
|
||||||
metric.add_batch(
|
metric.add_batch(
|
||||||
predictions=preds,
|
predictions=preds,
|
||||||
@@ -550,12 +572,15 @@ def main():
|
|||||||
logger.info("Evaluation metrics:")
|
logger.info("Evaluation metrics:")
|
||||||
for key, val in eval_metric.items():
|
for key, val in eval_metric.items():
|
||||||
logger.info(f"{key}: {val:.4f}")
|
logger.info(f"{key}: {val:.4f}")
|
||||||
|
|
||||||
|
if training_args.output_dir is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_metric))
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# We don't do predictions in the strategy scope because there are some issues in there right now.
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
# They'll get fixed eventually, promise!
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
|
|
||||||
if training_args.output_dir is not None:
|
|
||||||
model.save_pretrained(training_args.output_dir)
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -18,30 +18,32 @@ Fine-tuning the library models for translation.
|
|||||||
"""
|
"""
|
||||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from functools import partial
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
import evaluate
|
import evaluate
|
||||||
import transformers
|
import transformers
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
|
DataCollatorForSeq2Seq,
|
||||||
HfArgumentParser,
|
HfArgumentParser,
|
||||||
|
KerasMetricCallback,
|
||||||
M2M100Tokenizer,
|
M2M100Tokenizer,
|
||||||
MBart50Tokenizer,
|
MBart50Tokenizer,
|
||||||
MBart50TokenizerFast,
|
MBart50TokenizerFast,
|
||||||
MBartTokenizer,
|
MBartTokenizer,
|
||||||
MBartTokenizerFast,
|
MBartTokenizerFast,
|
||||||
|
PushToHubCallback,
|
||||||
TFAutoModelForSeq2SeqLM,
|
TFAutoModelForSeq2SeqLM,
|
||||||
TFTrainingArguments,
|
TFTrainingArguments,
|
||||||
create_optimizer,
|
create_optimizer,
|
||||||
@@ -224,6 +226,16 @@ class DataTrainingArguments:
|
|||||||
source_prefix: Optional[str] = field(
|
source_prefix: Optional[str] = field(
|
||||||
default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
|
default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
|
||||||
)
|
)
|
||||||
|
forced_bos_token: Optional[str] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": (
|
||||||
|
"The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
|
||||||
|
" multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
|
||||||
|
" be the target language token.(Usually it is the target language token)"
|
||||||
|
)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
|
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
|
||||||
@@ -239,70 +251,6 @@ class DataTrainingArguments:
|
|||||||
self.val_max_target_length = self.max_target_length
|
self.val_max_target_length = self.max_target_length
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
# region Data generator
|
|
||||||
def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None):
|
|
||||||
if shuffle:
|
|
||||||
sample_ordering = np.random.permutation(len(dataset))
|
|
||||||
else:
|
|
||||||
sample_ordering = np.arange(len(dataset))
|
|
||||||
for sample_idx in sample_ordering:
|
|
||||||
example = dataset[int(sample_idx)]
|
|
||||||
# Handle dicts with proper padding and conversion to tensor.
|
|
||||||
example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
|
|
||||||
example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()}
|
|
||||||
if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"):
|
|
||||||
decoder_input_ids = model.prepare_decoder_input_ids_from_labels(
|
|
||||||
labels=tf.expand_dims(example["labels"], 0)
|
|
||||||
)
|
|
||||||
example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0)
|
|
||||||
yield example, example["labels"] # TF needs some kind of labels, even if we don't use them
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
|
||||||
|
|
||||||
|
|
||||||
# region Helper functions
|
|
||||||
def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle):
|
|
||||||
if dataset is None:
|
|
||||||
return None
|
|
||||||
train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle)
|
|
||||||
train_signature = {
|
|
||||||
feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
|
|
||||||
for feature in dataset.features
|
|
||||||
if feature != "special_tokens_mask"
|
|
||||||
}
|
|
||||||
if (
|
|
||||||
model is not None
|
|
||||||
and "decoder_input_ids" not in train_signature
|
|
||||||
and hasattr(model, "prepare_decoder_input_ids_from_labels")
|
|
||||||
):
|
|
||||||
train_signature["decoder_input_ids"] = train_signature["labels"]
|
|
||||||
# This may need to be changed depending on your particular model or tokenizer!
|
|
||||||
padding_values = {
|
|
||||||
key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32)
|
|
||||||
for key in train_signature.keys()
|
|
||||||
}
|
|
||||||
padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32)
|
|
||||||
train_signature["labels"] = train_signature["input_ids"]
|
|
||||||
train_signature = (train_signature, train_signature["labels"])
|
|
||||||
options = tf.data.Options()
|
|
||||||
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
|
||||||
tf_dataset = (
|
|
||||||
tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
|
|
||||||
.with_options(options)
|
|
||||||
.padded_batch(
|
|
||||||
batch_size=total_batch_size,
|
|
||||||
drop_remainder=True,
|
|
||||||
padding_values=(padding_values, np.array(-100, dtype=np.int32)),
|
|
||||||
)
|
|
||||||
.repeat(int(num_epochs))
|
|
||||||
)
|
|
||||||
return tf_dataset
|
|
||||||
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|
||||||
@@ -541,67 +489,149 @@ def main():
|
|||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Prepare TF Dataset objects
|
# region Prepare TF Dataset objects
|
||||||
|
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||||
|
data_collator = DataCollatorForSeq2Seq(
|
||||||
|
tokenizer,
|
||||||
|
model=model,
|
||||||
|
label_pad_token_id=label_pad_token_id,
|
||||||
|
pad_to_multiple_of=64, # Reduce the number of unique shapes for XLA, especially for generation
|
||||||
|
return_tensors="tf",
|
||||||
|
)
|
||||||
num_replicas = training_args.strategy.num_replicas_in_sync
|
num_replicas = training_args.strategy.num_replicas_in_sync
|
||||||
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
total_train_batch_size = training_args.per_device_train_batch_size * num_replicas
|
||||||
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
|
||||||
tf_train_dataset = dataset_to_tf(
|
|
||||||
|
dataset_options = tf.data.Options()
|
||||||
|
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
|
||||||
|
|
||||||
|
# model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
|
||||||
|
# training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
|
||||||
|
# use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
|
||||||
|
# yourself if you use this method, whereas they are automatically inferred from the model input names when
|
||||||
|
# using model.prepare_tf_dataset()
|
||||||
|
# For more info see the docs:
|
||||||
|
# https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
|
||||||
|
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
|
||||||
|
|
||||||
|
tf_train_dataset = model.prepare_tf_dataset(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
model,
|
collate_fn=data_collator,
|
||||||
tokenizer,
|
batch_size=total_train_batch_size,
|
||||||
total_batch_size=total_train_batch_size,
|
|
||||||
num_epochs=training_args.num_train_epochs,
|
|
||||||
shuffle=True,
|
shuffle=True,
|
||||||
)
|
).with_options(dataset_options)
|
||||||
tf_eval_dataset = dataset_to_tf(
|
tf_eval_dataset = model.prepare_tf_dataset(
|
||||||
eval_dataset,
|
eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False
|
||||||
model,
|
).with_options(dataset_options)
|
||||||
tokenizer,
|
|
||||||
total_eval_batch_size,
|
|
||||||
num_epochs=1,
|
|
||||||
shuffle=False,
|
|
||||||
)
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Optimizer, loss and LR scheduling
|
# region Optimizer and LR scheduling
|
||||||
# Scheduler and math around the number of training steps.
|
num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
|
||||||
num_update_steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size
|
if training_args.warmup_steps > 0:
|
||||||
num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch
|
num_warmup_steps = training_args.warmup_steps
|
||||||
optimizer, lr_schedule = create_optimizer(
|
elif training_args.warmup_ratio > 0:
|
||||||
init_lr=training_args.learning_rate,
|
num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
|
||||||
num_train_steps=num_train_steps,
|
else:
|
||||||
num_warmup_steps=training_args.warmup_steps,
|
num_warmup_steps = 0
|
||||||
)
|
if training_args.do_train:
|
||||||
|
optimizer, lr_schedule = create_optimizer(
|
||||||
def masked_sparse_categorical_crossentropy(y_true, y_pred):
|
init_lr=training_args.learning_rate,
|
||||||
# We clip the negative labels to 0 to avoid NaNs appearing in the output and
|
num_train_steps=num_train_steps,
|
||||||
# fouling up everything that comes afterwards. The loss values corresponding to clipped values
|
num_warmup_steps=num_warmup_steps,
|
||||||
# will be masked later anyway, but even masked NaNs seem to cause overflows for some reason.
|
adam_beta1=training_args.adam_beta1,
|
||||||
# 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely
|
adam_beta2=training_args.adam_beta2,
|
||||||
# event that you have more than 1 million tokens in your vocabulary, consider increasing this value.
|
adam_epsilon=training_args.adam_epsilon,
|
||||||
# More pragmatically, consider redesigning your tokenizer.
|
weight_decay_rate=training_args.weight_decay,
|
||||||
losses = tf.keras.losses.sparse_categorical_crossentropy(
|
adam_global_clipnorm=training_args.max_grad_norm,
|
||||||
tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True
|
|
||||||
)
|
)
|
||||||
# Compute the per-sample loss only over the unmasked tokens
|
else:
|
||||||
losses = tf.ragged.boolean_mask(losses, y_true != -100)
|
optimizer = None
|
||||||
losses = tf.reduce_mean(losses, axis=-1)
|
|
||||||
return losses
|
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Metric and postprocessing
|
# region Metric and postprocessing
|
||||||
metric = evaluate.load("sacrebleu")
|
if training_args.do_eval:
|
||||||
|
metric = evaluate.load("sacrebleu")
|
||||||
|
|
||||||
def postprocess_text(preds, labels):
|
if data_args.val_max_target_length is None:
|
||||||
preds = [pred.strip() for pred in preds]
|
data_args.val_max_target_length = data_args.max_target_length
|
||||||
labels = [[label.strip()] for label in labels]
|
|
||||||
|
|
||||||
return preds, labels
|
gen_kwargs = {
|
||||||
|
"max_length": data_args.val_max_target_length,
|
||||||
|
"num_beams": data_args.num_beams,
|
||||||
|
"no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default
|
||||||
|
}
|
||||||
|
|
||||||
|
def postprocess_text(preds, labels):
|
||||||
|
preds = [pred.strip() for pred in preds]
|
||||||
|
labels = [[label.strip()] for label in labels]
|
||||||
|
|
||||||
|
return preds, labels
|
||||||
|
|
||||||
|
def compute_metrics(preds):
|
||||||
|
predictions, labels = preds
|
||||||
|
if isinstance(predictions, tuple):
|
||||||
|
predictions = predictions[0]
|
||||||
|
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
|
||||||
|
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||||
|
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||||
|
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||||
|
metrics = metric.compute(predictions=decoded_preds, references=decoded_labels)
|
||||||
|
return {"bleu": metrics["score"]}
|
||||||
|
|
||||||
|
# The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics
|
||||||
|
# to be computed each epoch. Any Python code can be included in the metric_fn. This is especially
|
||||||
|
# useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs.
|
||||||
|
# For more information, see the docs at
|
||||||
|
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback
|
||||||
|
|
||||||
|
metric_callback = KerasMetricCallback(
|
||||||
|
metric_fn=compute_metrics,
|
||||||
|
eval_dataset=tf_eval_dataset,
|
||||||
|
predict_with_generate=True,
|
||||||
|
use_xla_generation=True,
|
||||||
|
generate_kwargs=gen_kwargs,
|
||||||
|
)
|
||||||
|
callbacks = [metric_callback]
|
||||||
|
else:
|
||||||
|
callbacks = []
|
||||||
|
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
# region Preparing push_to_hub and model card
|
||||||
|
push_to_hub_model_id = training_args.push_to_hub_model_id
|
||||||
|
model_name = model_args.model_name_or_path.split("/")[-1]
|
||||||
|
if not push_to_hub_model_id:
|
||||||
|
push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}"
|
||||||
|
|
||||||
|
model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
|
||||||
|
if data_args.dataset_name is not None:
|
||||||
|
model_card_kwargs["dataset_tags"] = data_args.dataset_name
|
||||||
|
if data_args.dataset_config_name is not None:
|
||||||
|
model_card_kwargs["dataset_args"] = data_args.dataset_config_name
|
||||||
|
model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||||
|
else:
|
||||||
|
model_card_kwargs["dataset"] = data_args.dataset_name
|
||||||
|
|
||||||
|
languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None]
|
||||||
|
if len(languages) > 0:
|
||||||
|
model_card_kwargs["language"] = languages
|
||||||
|
|
||||||
|
if training_args.push_to_hub:
|
||||||
|
# Because this training can be quite long, we save once per epoch.
|
||||||
|
callbacks.append(
|
||||||
|
PushToHubCallback(
|
||||||
|
output_dir=training_args.output_dir,
|
||||||
|
model_id=push_to_hub_model_id,
|
||||||
|
organization=training_args.push_to_hub_organization,
|
||||||
|
token=training_args.push_to_hub_token,
|
||||||
|
tokenizer=tokenizer,
|
||||||
|
**model_card_kwargs,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# endregion
|
||||||
|
|
||||||
# region Training
|
# region Training
|
||||||
model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer)
|
eval_metrics = None
|
||||||
|
model.compile(optimizer=optimizer, jit_compile=training_args.xla)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
logger.info("***** Running training *****")
|
logger.info("***** Running training *****")
|
||||||
@@ -611,41 +641,48 @@ def main():
|
|||||||
logger.info(f" Total train batch size = {total_train_batch_size}")
|
logger.info(f" Total train batch size = {total_train_batch_size}")
|
||||||
logger.info(f" Total optimization steps = {num_train_steps}")
|
logger.info(f" Total optimization steps = {num_train_steps}")
|
||||||
|
|
||||||
model.fit(
|
if training_args.xla and not data_args.pad_to_max_length:
|
||||||
tf_train_dataset,
|
logger.warning(
|
||||||
epochs=int(training_args.num_train_epochs),
|
"XLA training may be slow at first when --pad_to_max_length is not set "
|
||||||
steps_per_epoch=num_update_steps_per_epoch,
|
"until all possible shapes have been compiled."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks)
|
||||||
|
eval_metrics = {key: val[-1] for key, val in history.history.items()}
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
# region Validation
|
# region Validation
|
||||||
if data_args.val_max_target_length is None:
|
if training_args.do_eval and not training_args.do_train:
|
||||||
data_args.val_max_target_length = data_args.max_target_length
|
# Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate
|
||||||
|
@tf.function(jit_compile=True)
|
||||||
|
def generate(**kwargs):
|
||||||
|
return model.generate(**kwargs)
|
||||||
|
|
||||||
gen_kwargs = {
|
if training_args.do_eval:
|
||||||
"max_length": data_args.val_max_target_length,
|
logger.info("Evaluation...")
|
||||||
"num_beams": data_args.num_beams,
|
for batch, labels in tf_eval_dataset:
|
||||||
}
|
batch.update(gen_kwargs)
|
||||||
if training_args.do_eval:
|
generated_tokens = generate(**batch)
|
||||||
logger.info("Evaluation...")
|
if isinstance(generated_tokens, tuple):
|
||||||
for batch, labels in tqdm(
|
generated_tokens = generated_tokens[0]
|
||||||
tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size
|
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
||||||
):
|
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||||
batch.update(gen_kwargs)
|
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||||
generated_tokens = model.generate(**batch)
|
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||||
if isinstance(generated_tokens, tuple):
|
|
||||||
generated_tokens = generated_tokens[0]
|
|
||||||
decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
|
|
||||||
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
|
||||||
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
|
||||||
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
|
||||||
|
|
||||||
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
|
||||||
eval_metric = metric.compute()
|
|
||||||
logger.info({"bleu": eval_metric["score"]})
|
eval_metrics = metric.compute()
|
||||||
|
logger.info({"bleu": eval_metrics["score"]})
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
if training_args.output_dir is not None:
|
if training_args.output_dir is not None and eval_metrics is not None:
|
||||||
|
output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
|
||||||
|
with open(output_eval_file, "w") as writer:
|
||||||
|
writer.write(json.dumps(eval_metrics))
|
||||||
|
|
||||||
|
if training_args.output_dir is not None and not training_args.push_to_hub:
|
||||||
|
# If we're not pushing to hub, at least save a local copy when we're done
|
||||||
model.save_pretrained(training_args.output_dir)
|
model.save_pretrained(training_args.output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -87,6 +87,8 @@ def create_optimizer(
|
|||||||
adam_beta1: float = 0.9,
|
adam_beta1: float = 0.9,
|
||||||
adam_beta2: float = 0.999,
|
adam_beta2: float = 0.999,
|
||||||
adam_epsilon: float = 1e-8,
|
adam_epsilon: float = 1e-8,
|
||||||
|
adam_clipnorm: Optional[float] = None,
|
||||||
|
adam_global_clipnorm: Optional[float] = None,
|
||||||
weight_decay_rate: float = 0.0,
|
weight_decay_rate: float = 0.0,
|
||||||
power: float = 1.0,
|
power: float = 1.0,
|
||||||
include_in_weight_decay: Optional[List[str]] = None,
|
include_in_weight_decay: Optional[List[str]] = None,
|
||||||
@@ -109,6 +111,11 @@ def create_optimizer(
|
|||||||
The beta2 to use in Adam.
|
The beta2 to use in Adam.
|
||||||
adam_epsilon (`float`, *optional*, defaults to 1e-8):
|
adam_epsilon (`float`, *optional*, defaults to 1e-8):
|
||||||
The epsilon to use in Adam.
|
The epsilon to use in Adam.
|
||||||
|
adam_clipnorm: (`float`, *optional*, defaults to `None`):
|
||||||
|
If not `None`, clip the gradient norm for each weight tensor to this value.
|
||||||
|
adam_global_clipnorm: (`float`, *optional*, defaults to `None`)
|
||||||
|
If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
|
||||||
|
weight tensors, as if they were concatenated into a single vector.
|
||||||
weight_decay_rate (`float`, *optional*, defaults to 0):
|
weight_decay_rate (`float`, *optional*, defaults to 0):
|
||||||
The weight decay to use.
|
The weight decay to use.
|
||||||
power (`float`, *optional*, defaults to 1.0):
|
power (`float`, *optional*, defaults to 1.0):
|
||||||
@@ -137,12 +144,19 @@ def create_optimizer(
|
|||||||
beta_1=adam_beta1,
|
beta_1=adam_beta1,
|
||||||
beta_2=adam_beta2,
|
beta_2=adam_beta2,
|
||||||
epsilon=adam_epsilon,
|
epsilon=adam_epsilon,
|
||||||
|
clipnorm=adam_clipnorm,
|
||||||
|
global_clipnorm=adam_global_clipnorm,
|
||||||
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
|
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
|
||||||
include_in_weight_decay=include_in_weight_decay,
|
include_in_weight_decay=include_in_weight_decay,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
optimizer = tf.keras.optimizers.Adam(
|
optimizer = tf.keras.optimizers.Adam(
|
||||||
learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon
|
learning_rate=lr_schedule,
|
||||||
|
beta_1=adam_beta1,
|
||||||
|
beta_2=adam_beta2,
|
||||||
|
epsilon=adam_epsilon,
|
||||||
|
clipnorm=adam_clipnorm,
|
||||||
|
global_clipnorm=adam_global_clipnorm,
|
||||||
)
|
)
|
||||||
# We return the optimizer and the LR scheduler in order to better track the
|
# We return the optimizer and the LR scheduler in order to better track the
|
||||||
# evolution of the LR independently of the optimizer.
|
# evolution of the LR independently of the optimizer.
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ class OptimizerNames(ExplicitEnum):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TrainingArguments:
|
class TrainingArguments:
|
||||||
|
framework = "pt"
|
||||||
"""
|
"""
|
||||||
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
||||||
itself**.
|
itself**.
|
||||||
@@ -1039,25 +1040,25 @@ class TrainingArguments:
|
|||||||
self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
|
self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"]
|
||||||
if self.run_name is None:
|
if self.run_name is None:
|
||||||
self.run_name = self.output_dir
|
self.run_name = self.output_dir
|
||||||
|
if self.framework == "pt" and is_torch_available():
|
||||||
if self.fp16_backend and self.fp16_backend != "auto":
|
if self.fp16_backend and self.fp16_backend != "auto":
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
|
"`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use"
|
||||||
" `half_precision_backend` instead",
|
" `half_precision_backend` instead",
|
||||||
FutureWarning,
|
FutureWarning,
|
||||||
)
|
|
||||||
self.half_precision_backend = self.fp16_backend
|
|
||||||
|
|
||||||
if self.bf16 or self.bf16_full_eval:
|
|
||||||
|
|
||||||
if self.no_cuda and not is_torch_bf16_cpu_available():
|
|
||||||
# cpu
|
|
||||||
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
|
|
||||||
elif not self.no_cuda and not is_torch_bf16_gpu_available():
|
|
||||||
# gpu
|
|
||||||
raise ValueError(
|
|
||||||
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
|
|
||||||
)
|
)
|
||||||
|
self.half_precision_backend = self.fp16_backend
|
||||||
|
|
||||||
|
if self.bf16 or self.bf16_full_eval:
|
||||||
|
|
||||||
|
if self.no_cuda and not is_torch_bf16_cpu_available():
|
||||||
|
# cpu
|
||||||
|
raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10")
|
||||||
|
elif not self.no_cuda and not is_torch_bf16_gpu_available():
|
||||||
|
# gpu
|
||||||
|
raise ValueError(
|
||||||
|
"Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0"
|
||||||
|
)
|
||||||
|
|
||||||
if self.fp16 and self.bf16:
|
if self.fp16 and self.bf16:
|
||||||
raise ValueError("At most one of fp16 and bf16 can be True, but not both")
|
raise ValueError("At most one of fp16 and bf16 can be True, but not both")
|
||||||
@@ -1084,7 +1085,8 @@ class TrainingArguments:
|
|||||||
self.optim = OptimizerNames.ADAFACTOR
|
self.optim = OptimizerNames.ADAFACTOR
|
||||||
|
|
||||||
if (
|
if (
|
||||||
is_torch_available()
|
self.framework == "pt"
|
||||||
|
and is_torch_available()
|
||||||
and (self.device.type != "cuda")
|
and (self.device.type != "cuda")
|
||||||
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
||||||
and (self.fp16 or self.fp16_full_eval)
|
and (self.fp16 or self.fp16_full_eval)
|
||||||
@@ -1095,7 +1097,8 @@ class TrainingArguments:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
is_torch_available()
|
self.framework == "pt"
|
||||||
|
and is_torch_available()
|
||||||
and (self.device.type != "cuda")
|
and (self.device.type != "cuda")
|
||||||
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
|
||||||
and (self.device.type != "cpu")
|
and (self.device.type != "cpu")
|
||||||
@@ -1106,7 +1109,7 @@ class TrainingArguments:
|
|||||||
" (`--bf16_full_eval`) can only be used on CUDA or CPU devices."
|
" (`--bf16_full_eval`) can only be used on CUDA or CPU devices."
|
||||||
)
|
)
|
||||||
|
|
||||||
if is_torch_available() and self.tf32 is not None:
|
if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
|
||||||
if self.tf32:
|
if self.tf32:
|
||||||
if is_torch_tf32_available():
|
if is_torch_tf32_available():
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ if is_tf_available():
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TFTrainingArguments(TrainingArguments):
|
class TFTrainingArguments(TrainingArguments):
|
||||||
|
framework = "tf"
|
||||||
"""
|
"""
|
||||||
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
|
||||||
itself**.
|
itself**.
|
||||||
@@ -188,9 +189,6 @@ class TFTrainingArguments(TrainingArguments):
|
|||||||
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]:
|
def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]:
|
||||||
logger.info("Tensorflow: setting up strategy")
|
logger.info("Tensorflow: setting up strategy")
|
||||||
|
|
||||||
if self.xla:
|
|
||||||
tf.config.optimizer.set_jit(True)
|
|
||||||
|
|
||||||
gpus = tf.config.list_physical_devices("GPU")
|
gpus = tf.config.list_physical_devices("GPU")
|
||||||
|
|
||||||
# Set to float16 at first
|
# Set to float16 at first
|
||||||
|
|||||||
Reference in New Issue
Block a user