From 6eb51450fa2a440a45e02b29f01e4f2aa4f70a4d Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 10 Aug 2022 11:49:51 -0400 Subject: [PATCH] TF Examples Rewrite (#18451) * Finished QA example * Dodge a merge conflict * Update text classification and LM examples * Update NER example * New Keras metrics WIP, fix NER example * Update NER example * Update MC, summarization and translation examples * Add XLA warnings when shapes are variable * Make sure batch_size is consistently scaled by num_replicas * Add PushToHubCallback to all models * Add docs links for KerasMetricCallback * Add docs links for prepare_tf_dataset and jit_compile * Correct inferred model names * Don't assume the dataset has 'lang' * Don't assume the dataset has 'lang' * Write metrics in text classification * Add 'framework' to TrainingArguments and TFTrainingArguments * Export metrics in all examples and add tests * Fix training args for Flax * Update command line args for translation test * make fixup * Fix accidentally running other tests in fp16 * Remove do_train/do_eval from run_clm.py * Remove do_train/do_eval from run_mlm.py * Add tensorflow tests to circleci * Fix circleci * Update examples/tensorflow/language-modeling/run_mlm.py Co-authored-by: Joao Gante * Update examples/tensorflow/test_tensorflow_examples.py Co-authored-by: Joao Gante * Update examples/tensorflow/translation/run_translation.py Co-authored-by: Joao Gante * Update examples/tensorflow/token-classification/run_ner.py Co-authored-by: Joao Gante * Fix save path for tests * Fix some model card kwargs * Explain the magical -1000 * Actually enable tests this time * Skip text classification PR until we fix shape inference * make fixup Co-authored-by: Joao Gante --- .circleci/config.yml | 67 ++++ examples/tensorflow/_tests_requirements.txt | 25 ++ .../tensorflow/language-modeling/run_clm.py | 159 ++++++--- .../tensorflow/language-modeling/run_mlm.py | 140 +++++--- .../tensorflow/multiple-choice/run_swag.py | 132 +++++--- 
.../tensorflow/question-answering/run_qa.py | 187 ++++++++--- .../summarization/run_summarization.py | 278 +++++++++------- .../tensorflow/test_tensorflow_examples.py | 295 +++++++++++++++++ .../text-classification/run_glue.py | 135 +++++--- .../run_text_classification.py | 141 +++++--- .../token-classification/run_ner.py | 213 ++++++------ .../tensorflow/translation/run_translation.py | 313 ++++++++++-------- src/transformers/optimization_tf.py | 16 +- src/transformers/training_args.py | 45 +-- src/transformers/training_args_tf.py | 4 +- 15 files changed, 1490 insertions(+), 660 deletions(-) create mode 100644 examples/tensorflow/_tests_requirements.txt create mode 100644 examples/tensorflow/test_tensorflow_examples.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 83ee65248e..666505ab3b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -658,6 +658,71 @@ jobs: - store_artifacts: path: ~/transformers/reports + run_examples_tensorflow: + working_directory: ~/transformers + docker: + - image: cimg/python:3.7.12 + environment: + OMP_NUM_THREADS: 1 + TRANSFORMERS_IS_CI: yes + PYTEST_TIMEOUT: 120 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - restore_cache: + keys: + - v0.5-tensorflow_examples-{{ checksum "setup.py" }} + - v0.5-{{ checksum "setup.py" }} + - run: pip install --upgrade pip + - run: pip install .[sklearn,tensorflow,sentencepiece,testing] + - run: pip install -r examples/tensorflow/_tests_requirements.txt + - save_cache: + key: v0.5-tensorflow_examples-{{ checksum "setup.py" }} + paths: + - '~/.cache/pip' + - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt + - store_artifacts: + path: ~/transformers/test_preparation.txt + - run: | + if [ -f test_list.txt ]; then + python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee tests_output.txt + fi + - store_artifacts: + path: 
~/transformers/tensorflow_examples_output.txt + - store_artifacts: + path: ~/transformers/reports + + run_examples_tensorflow_all: + working_directory: ~/transformers + docker: + - image: cimg/python:3.7.12 + environment: + OMP_NUM_THREADS: 1 + TRANSFORMERS_IS_CI: yes + PYTEST_TIMEOUT: 120 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - restore_cache: + keys: + - v0.5-tensorflow_examples-{{ checksum "setup.py" }} + - v0.5-{{ checksum "setup.py" }} + - run: pip install --upgrade pip + - run: pip install .[sklearn,tensorflow,sentencepiece,testing] + - run: pip install -r examples/tensorflow/_tests_requirements.txt + - save_cache: + key: v0.5-tensorflow_examples-{{ checksum "setup.py" }} + paths: + - '~/.cache/pip' + - run: | + TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --max-worker-restart=0 --dist=loadfile -s --make-reports=examples_tensorflow ./examples/tensorflow/ | tee examples_output.txt + - store_artifacts: + path: ~/transformers/tensorflow_examples_output.txt + - store_artifacts: + path: ~/transformers/reports + run_examples_flax: working_directory: ~/transformers docker: @@ -1000,6 +1065,7 @@ workflows: - check_code_quality - check_repository_consistency - run_examples_torch + - run_examples_tensorflow - run_examples_flax - run_tests_custom_tokenizers - run_tests_torch_and_tf @@ -1022,6 +1088,7 @@ workflows: - main jobs: - run_examples_torch_all + - run_examples_tensorflow_all - run_examples_flax_all - run_tests_torch_and_tf_all - run_tests_torch_and_flax_all diff --git a/examples/tensorflow/_tests_requirements.txt b/examples/tensorflow/_tests_requirements.txt new file mode 100644 index 0000000000..37e37e3525 --- /dev/null +++ b/examples/tensorflow/_tests_requirements.txt @@ -0,0 +1,25 @@ +tensorflow +tensorboard +scikit-learn +seqeval +psutil +sacrebleu >= 1.4.12 +git+https://github.com/huggingface/accelerate@main#egg=accelerate +rouge-score +tensorflow_datasets +matplotlib +git-python==1.0.3 +faiss-cpu +streamlit +elasticsearch +nltk 
+pandas +datasets >= 1.13.3 +fire +pytest +conllu +sentencepiece != 0.1.92 +protobuf +jiwer +librosa +evaluate >= 0.2.0 diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 3f12683d10..cbe2f54f22 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -22,6 +22,8 @@ https://huggingface.co/models?filter=text-generation """ # You can also adapt this script on your own clm task. Pointers for this are left as comments. +import json + # region Imports import logging import math @@ -46,8 +48,8 @@ from transformers import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, AutoConfig, AutoTokenizer, - DefaultDataCollator, HfArgumentParser, + PushToHubCallback, TFAutoModelForCausalLM, TFTrainingArguments, create_optimizer, @@ -205,21 +207,6 @@ class DataTrainingArguments: assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." -# endregion - -# region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. 
- def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - # endregion @@ -299,6 +286,7 @@ def main(): raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): @@ -306,12 +294,14 @@ def main(): data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: @@ -321,16 +311,39 @@ def main(): data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file - extension = data_args.train_file.split(".")[-1] + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks raw_datasets = load_dataset( extension, data_files=data_files, + cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
+ if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + use_auth_token=True if model_args.use_auth_token else None, + **dataset_args, + ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion @@ -446,7 +459,7 @@ def main(): eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): + for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # endregion @@ -465,44 +478,88 @@ def main(): # region TF Dataset preparation num_replicas = training_args.strategy.num_replicas_in_sync - data_collator = DefaultDataCollator(return_tensors="tf") options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_train_dataset = train_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in train_dataset.features if col != "special_tokens_mask"], + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( + train_dataset, shuffle=True, batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, - drop_remainder=True, ).with_options(options) - tf_eval_dataset = eval_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in eval_dataset.features if col != "special_tokens_mask"], + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, shuffle=False, - batch_size=num_replicas * training_args.per_device_train_batch_size, - collate_fn=data_collator, + batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True, ).with_options(options) # endregion # region Optimizer and loss - batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size) + num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, - num_train_steps=int(training_args.num_train_epochs * batches_per_epoch), - num_warmup_steps=training_args.warmup_steps, + 
num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer) + model.compile(optimizer=optimizer, jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-clm" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] # endregion # region Training and validation @@ -512,33 +569,45 @@ def main(): logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") + # For long training runs, you may wish to use the PushToHub() callback here to 
save intermediate checkpoints + # to the Hugging Face Hub rather than just pushing the finished model. + # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback + history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), - steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas), - callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)], + callbacks=callbacks, ) + train_loss = history.history["loss"][-1] try: - train_perplexity = math.exp(history.history["loss"][-1]) + train_perplexity = math.exp(train_loss) except OverflowError: train_perplexity = math.inf + logger.info(f" Final train loss: {train_loss:.3f}") + logger.info(f" Final train perplexity: {train_perplexity:.3f}") + validation_loss = history.history["val_loss"][-1] try: - validation_perplexity = math.exp(history.history["val_loss"][-1]) + validation_perplexity = math.exp(validation_loss) except OverflowError: validation_perplexity = math.inf - logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}") - logger.info(f" Final train perplexity: {train_perplexity:.3f}") - logger.info(f" Final validation loss: {history.history['val_loss'][-1]:.3f}") + logger.info(f" Final validation loss: {validation_loss:.3f}") logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") - # endregion if training_args.output_dir is not None: - model.save_pretrained(training_args.output_dir) + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + results_dict = dict() + results_dict["train_loss"] = train_loss + results_dict["train_perplexity"] = train_perplexity + results_dict["eval_loss"] = validation_loss + results_dict["eval_perplexity"] = validation_perplexity + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(results_dict)) + # endregion - if training_args.push_to_hub: - # You'll probably 
want to include some of your own metadata here! - model.push_to_hub() + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index b421ed8e66..43449a0934 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -22,9 +22,7 @@ https://huggingface.co/models?filter=fill-mask """ # You can also adapt this script on your own mlm task. Pointers for this are left as comments. -# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected -# TODO Duplicate all changes over to the CLM script - +import json import logging import math import os @@ -50,6 +48,7 @@ from transformers import ( AutoTokenizer, DataCollatorForLanguageModeling, HfArgumentParser, + PushToHubCallback, TFAutoModelForMaskedLM, TFTrainingArguments, create_optimizer, @@ -217,22 +216,6 @@ class DataTrainingArguments: # endregion -# region Helper classes -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. 
- def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - def main(): # region Argument Parsing parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) @@ -492,7 +475,7 @@ def main(): eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: - for index in random.sample(range(len(train_dataset)), 3): + for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # endregion @@ -517,40 +500,88 @@ def main(): options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_train_dataset = train_dataset.to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=[col for col in train_dataset.features if col != "special_tokens_mask"] + ["labels"], + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( + train_dataset, shuffle=True, batch_size=num_replicas * training_args.per_device_train_batch_size, collate_fn=data_collator, - drop_remainder=True, ).with_options(options) - tf_eval_dataset = eval_dataset.to_tf_dataset( + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, # labels are passed as input, as we will use the model's internal loss - columns=[col for col in eval_dataset.features if col != "special_tokens_mask"] + ["labels"], shuffle=False, - batch_size=num_replicas * training_args.per_device_train_batch_size, + batch_size=num_replicas * training_args.per_device_eval_batch_size, collate_fn=data_collator, drop_remainder=True, ).with_options(options) # endregion # region Optimizer and loss - batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size) + num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, - num_train_steps=int(training_args.num_train_epochs * batches_per_epoch), - num_warmup_steps=training_args.warmup_steps, + 
num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer) + model.compile(optimizer=optimizer, jit_compile=training_args.xla, run_eagerly=True) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-mlm" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] # endregion # region Training and validation @@ -560,33 +591,46 @@ def main(): logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") + # For long training runs, you may wish to use the PushToHub() 
callback here to save intermediate checkpoints + # to the Hugging Face Hub rather than just pushing the finished model. + # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback + history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), - steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas), - callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)], + callbacks=callbacks, ) + train_loss = history.history["loss"][-1] try: - train_perplexity = math.exp(history.history["loss"][-1]) + train_perplexity = math.exp(train_loss) except OverflowError: train_perplexity = math.inf - try: - validation_perplexity = math.exp(history.history["val_loss"][-1]) - except OverflowError: - validation_perplexity = math.inf - logger.warning(f" Final train loss: {history.history['loss'][-1]:.3f}") - logger.warning(f" Final train perplexity: {train_perplexity:.3f}") - logger.warning(f" Final validation loss: {history.history['val_loss'][-1]:.3f}") - logger.warning(f" Final validation perplexity: {validation_perplexity:.3f}") + logger.info(f" Final train loss: {train_loss:.3f}") + logger.info(f" Final train perplexity: {train_perplexity:.3f}") + + validation_loss = history.history["val_loss"][-1] + try: + validation_perplexity = math.exp(validation_loss) + except OverflowError: + validation_perplexity = math.inf + logger.info(f" Final validation loss: {validation_loss:.3f}") + logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") + + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + results_dict = dict() + results_dict["train_loss"] = train_loss + results_dict["train_perplexity"] = train_perplexity + results_dict["eval_loss"] = validation_loss + results_dict["eval_perplexity"] = validation_perplexity + with open(output_eval_file, 
"w") as writer: + writer.write(json.dumps(results_dict)) # endregion - if training_args.output_dir is not None: - model.save_pretrained(training_args.output_dir) - - if training_args.push_to_hub: - # You'll probably want to append some of your own metadata here! - model.push_to_hub() + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 6ba35bd0fd..2684500d24 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -18,6 +18,7 @@ Fine-tuning the library models for multiple choice. """ # You can also adapt this script on your own multiple choice task. Pointers for this are left as comments. +import json import logging import os import sys @@ -38,6 +39,7 @@ from transformers import ( AutoTokenizer, DefaultDataCollator, HfArgumentParser, + PushToHubCallback, TFAutoModelForMultipleChoice, TFTrainingArguments, create_optimizer, @@ -54,16 +56,6 @@ logger = logging.getLogger(__name__) # region Helper classes and functions -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. 
- def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) @dataclass @@ -391,7 +383,6 @@ def main(): if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] - non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) @@ -407,8 +398,6 @@ def main(): if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] - if not training_args.do_train: - non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) @@ -444,79 +433,120 @@ def main(): num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas + if training_args.do_train: - total_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) + num_train_steps = (len(train_dataset) // total_train_batch_size) * int(training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, num_train_steps=int(total_train_steps), num_warmup_steps=0 + 
init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) else: - optimizer = "adam" # Just put anything in here, since we're not using it anyway - model.compile( - optimizer=optimizer, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), - metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")], - ) + optimizer = None + model.compile(optimizer=optimizer, metrics=["accuracy"], jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-multiplechoice" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "multiple-choice"} + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] # endregion # region Training + eval_metrics = None if training_args.do_train: - dataset_exclude_cols = set(non_label_columns + ["label"]) - tf_train_dataset = train_dataset.to_tf_dataset( - columns=[col for col in train_dataset.column_names if col not in dataset_exclude_cols], + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. 
This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( + train_dataset, shuffle=True, batch_size=total_train_batch_size, collate_fn=data_collator, - drop_remainder=True, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in train_dataset.column_names else None, - ) + ).with_options(dataset_options) if training_args.do_eval: - validation_data = eval_dataset.to_tf_dataset( - columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + validation_data = model.prepare_tf_dataset( + eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in eval_dataset.column_names else None, - ) + ).with_options(dataset_options) else: validation_data = None - model.fit( + history = model.fit( tf_train_dataset, validation_data=validation_data, epochs=int(training_args.num_train_epochs), - callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)], + callbacks=callbacks, ) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Evaluation if training_args.do_eval and not training_args.do_train: - dataset_exclude_cols = set(non_label_columns + ["label"]) + dataset_options = tf.data.Options() + 
dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # Do a standalone evaluation pass - tf_eval_dataset = eval_dataset.to_tf_dataset( - columns=[col for col in eval_dataset.column_names if col not in dataset_exclude_cols], + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, shuffle=False, batch_size=total_eval_batch_size, collate_fn=data_collator, drop_remainder=True, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in eval_dataset.column_names else None, - ) - model.evaluate(tf_eval_dataset) + ).with_options(dataset_options) + eval_results = model.evaluate(tf_eval_dataset) + eval_metrics = {"val_loss": eval_results[0], "val_accuracy": eval_results[1]} # endregion + if eval_metrics is not None and training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + # region Push to hub - if training_args.push_to_hub: - model.push_to_hub( - finetuned_from=model_args.model_name_or_path, - tasks="multiple-choice", - dataset_tags="swag", - dataset_args="regular", - dataset="SWAG", - language="en", - ) + + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) # endregion diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 91293aefb3..7f53a98415 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -18,6 +18,7 @@ Fine-tuning the library models for question answering. """ # You can also adapt this script on your own question answering task. Pointers for this are left as comments. 
+import json import logging import os import sys @@ -33,13 +34,13 @@ import transformers from transformers import ( AutoConfig, AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, EvalPrediction, HfArgumentParser, PreTrainedTokenizerFast, + PushToHubCallback, TFAutoModelForQuestionAnswering, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry @@ -609,7 +610,12 @@ def main(): # endregion with training_args.strategy.scope(): - # region Load model + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync + + # region Load model and prepare datasets if checkpoint is None: model_path = model_args.model_name_or_path else: @@ -621,71 +627,163 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) + if training_args.do_train: + + training_dataset = model.prepare_tf_dataset( + processed_datasets["train"], + shuffle=True, + batch_size=training_args.per_device_train_batch_size * num_replicas, + tokenizer=tokenizer, + ) + + training_dataset = training_dataset.with_options(dataset_options) + + num_train_steps = len(training_dataset) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=len(training_dataset) * 
training_args.num_train_epochs, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + + # no user-specified loss = will use the model internal loss + model.compile(optimizer=optimizer, jit_compile=training_args.xla, metrics=["accuracy"]) + + else: + model.compile(optimizer=None, jit_compile=training_args.xla, metrics=["accuracy"]) + training_dataset = None + + if training_args.do_eval: + eval_dataset = model.prepare_tf_dataset( + processed_datasets["validation"], + shuffle=False, + batch_size=training_args.per_device_train_batch_size * num_replicas, + tokenizer=tokenizer, + ) + eval_dataset = eval_dataset.with_options(dataset_options) + else: + eval_dataset = None + + if training_args.do_predict: + predict_dataset = model.prepare_tf_dataset( + processed_datasets["test"], + shuffle=False, + batch_size=training_args.per_device_eval_batch_size * num_replicas, + tokenizer=tokenizer, + ) + predict_dataset = predict_dataset.with_options(dataset_options) + else: + predict_dataset = None - # no user-specified loss = will use the model internal loss - model.compile(optimizer=optimizer) # endregion - # region Training - if padding: - data_collator = DefaultDataCollator(return_tensors="tf") + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-question-answering" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = 
data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") - tensor_keys = ["attention_mask", "input_ids"] - label_keys = ["start_positions", "end_positions"] + callbacks = [] + # endregion + + # region Training and Evaluation if training_args.do_train: - # Make a tf.data.Dataset for this - training_dataset = processed_datasets["train"].to_tf_dataset( - # labels are passed as input, as we will use the model's internal loss - columns=tensor_keys + label_keys, - shuffle=True, - batch_size=training_args.per_device_train_batch_size, - collate_fn=data_collator, - drop_remainder=True, - ) - model.fit(training_dataset, epochs=int(training_args.num_train_epochs)) - # endregion + # Note that the validation and test datasets have been processed in a different way to the + # training datasets in this example, and so they don't have the same label structure. + # As such, we don't pass them directly to Keras, but instead get model predictions to evaluate + # after training. 
+ model.fit(training_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) - # region Evaluation if training_args.do_eval: logger.info("*** Evaluation ***") - eval_inputs = { - "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(), - "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(), - } - eval_predictions = model.predict(eval_inputs) + + # In this example, we compute advanced metrics at the end of training, but + # if you'd like to compute metrics every epoch that are too complex to be written as + # standard Keras metrics, you can use our KerasMetricCallback. See + # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks + + eval_predictions = model.predict(eval_dataset) + if isinstance(eval_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. 
+ eval_start_logits = eval_predictions.start_logits.to_tensor(default_value=-1000).numpy() + eval_end_logits = eval_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + eval_start_logits = eval_predictions.start_logits + eval_end_logits = eval_predictions.end_logits post_processed_eval = post_processing_function( datasets["validation"], processed_datasets["validation"], - (eval_predictions.start_logits, eval_predictions.end_logits), + (eval_start_logits, eval_end_logits), ) metrics = compute_metrics(post_processed_eval) logging.info("Evaluation metrics:") for metric, value in metrics.items(): logging.info(f"{metric}: {value:.3f}") + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(metrics)) # endregion # region Prediction if training_args.do_predict: logger.info("*** Predict ***") - predict_inputs = { - "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(), - "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(), - } - test_predictions = model.predict(predict_inputs) + + test_predictions = model.predict(predict_dataset) + if isinstance(test_predictions.start_logits, tf.RaggedTensor): + # If predictions are RaggedTensor, we densify them. Since they are logits, padding with 0 is a bad idea! + # The reason is that a logit of 0 can often end up as quite a high probability value, sometimes even + # the highest probability in a sample. Instead, we use a large negative value, which ensures that the + # padding positions are correctly masked. 
+ test_start_logits = test_predictions.start_logits.to_tensor(default_value=-1000).numpy() + test_end_logits = test_predictions.end_logits.to_tensor(default_value=-1000).numpy() + else: + test_start_logits = test_predictions.start_logits + test_end_logits = test_predictions.end_logits post_processed_test = post_processing_function( datasets["test"], processed_datasets["test"], - (test_predictions.start_logits, test_predictions.end_logits), + (test_start_logits, test_end_logits), ) metrics = compute_metrics(post_processed_test) @@ -694,8 +792,9 @@ def main(): logging.info(f"{metric}: {value:.3f}") # endregion - if training_args.push_to_hub: - model.push_to_hub() + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 6d4cf99e67..2cf6bdba60 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -18,11 +18,11 @@ Fine-tuning the library models for summarization. """ # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. 
+import json import logging import os import sys from dataclasses import dataclass, field -from functools import partial from typing import Optional import datasets @@ -30,7 +30,6 @@ import nltk # Here to have a nice missing dependency error message early on import numpy as np import tensorflow as tf from datasets import load_dataset -from tqdm import tqdm import evaluate import transformers @@ -38,7 +37,10 @@ from filelock import FileLock from transformers import ( AutoConfig, AutoTokenizer, + DataCollatorForSeq2Seq, HfArgumentParser, + KerasMetricCallback, + PushToHubCallback, TFAutoModelForSeq2SeqLM, TFTrainingArguments, create_optimizer, @@ -253,7 +255,6 @@ class DataTrainingArguments: # endregion - # region Dataset name mappings summarization_name_mapping = { "amazon_reviews_multi": ("review_body", "review_title"), @@ -272,71 +273,6 @@ summarization_name_mapping = { # endregion -# region Data generator -def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None): - if shuffle: - sample_ordering = np.random.permutation(len(dataset)) - else: - sample_ordering = np.arange(len(dataset)) - for sample_idx in sample_ordering: - example = dataset[int(sample_idx)] - # Handle dicts with proper padding and conversion to tensor. 
- example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()} - if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): - decoder_input_ids = model.prepare_decoder_input_ids_from_labels( - labels=tf.expand_dims(example["labels"], 0) - ) - example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0) - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle): - if dataset is None: - return None - train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int32) - for feature in dataset.features - if feature != "special_tokens_mask" - } - if ( - model is not None - and "decoder_input_ids" not in train_signature - and hasattr(model, "prepare_decoder_input_ids_from_labels") - ): - train_signature["decoder_input_ids"] = train_signature["labels"] - # This may need to be changed depending on your particular model or tokenizer! 
- padding_values = { - key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32) - for key in train_signature.keys() - } - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(-100, dtype=np.int32)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - -# endregion - - def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py @@ -587,59 +523,148 @@ def main(): if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=128, # Reduce the number of unique shapes for XLA, especially for generation + return_tensors="tf", + ) + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - tf_train_dataset = dataset_to_tf( + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. 
This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - model, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=data_collator, + batch_size=total_train_batch_size, shuffle=True, - ) - tf_eval_dataset = dataset_to_tf( + ).with_options(dataset_options) + tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, - model, - tokenizer, - total_eval_batch_size, - num_epochs=1, + collate_fn=data_collator, + batch_size=total_eval_batch_size, shuffle=False, - ) + ).with_options(dataset_options) # endregion # region Optimizer, loss and LR scheduling - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size - num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0 - ) - - def masked_sparse_categorical_crossentropy(y_true, y_pred): - # We clip the negative labels to 0 to avoid NaNs appearing in the output and - # fouling up everything that comes afterwards. The loss values corresponding to clipped values - # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. 
- # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely - # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. - # More pragmatically, consider redesigning your tokenizer. - losses = tf.keras.losses.sparse_categorical_crossentropy( - tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + if training_args.do_train: + optimizer, lr_schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - # Compute the per-sample loss only over the unmasked tokens - losses = tf.ragged.boolean_mask(losses, y_true != -100) - losses = tf.reduce_mean(losses, axis=-1) - return losses + else: + optimizer = None # endregion - # region Metric - metric = evaluate.load("rouge") + # region Metric and KerasMetricCallback + if training_args.do_eval: + metric = evaluate.load("rouge") + + if data_args.val_max_target_length is None: + data_args.val_max_target_length = data_args.max_target_length + + gen_kwargs = { + "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, + "num_beams": data_args.num_beams, + "no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default + } + + def compute_metrics(preds): + predictions, labels = preds + if isinstance(predictions, tuple): + predictions = predictions[0] + decoded_preds = 
tokenizer.batch_decode(predictions, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + metrics = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + # Only print the mid f-measures, but there are a lot of other statistics in there too! + metrics = {key: round(val.mid.fmeasure * 100, 4) for key, val in metrics.items()} + return metrics + + # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics + # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially + # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. + # For more information, see the docs at + # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback + + metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, + eval_dataset=tf_eval_dataset, + predict_with_generate=True, + use_xla_generation=True, + generate_kwargs=gen_kwargs, + ) + callbacks = [metric_callback] + else: + callbacks = [] + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-summarization" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = 
data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + # Because this training can be quite long, we save once per epoch. + callbacks.append( + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ) # endregion # region Training - model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) - + model.compile(optimizer=optimizer, jit_compile=training_args.xla) + eval_metrics = None if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") @@ -648,28 +673,29 @@ def main(): logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") - model.fit( - tf_train_dataset, - epochs=int(training_args.num_train_epochs), - steps_per_epoch=num_update_steps_per_epoch, - ) + if training_args.xla and not data_args.pad_to_max_length: + logger.warning( + "XLA training may be slow at first when --pad_to_max_length is not set " + "until all possible shapes have been compiled." 
+ ) + history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Validation - if data_args.val_max_target_length is None: - data_args.val_max_target_length = data_args.max_target_length - gen_kwargs = { - "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, - "num_beams": data_args.num_beams, - } - if training_args.do_eval: + if training_args.do_eval and not training_args.do_train: + # Do a standalone evaluation run logger.info("Evaluation...") - for batch, labels in tqdm( - tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size - ): + + # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate + @tf.function(jit_compile=True) + def generate(**kwargs): + return model.generate(**kwargs) + + for batch, labels in tf_eval_dataset: batch.update(gen_kwargs) - generated_tokens = model.generate(**batch) + generated_tokens = generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) @@ -679,13 +705,19 @@ def main(): metric.add_batch(predictions=decoded_preds, references=decoded_labels) - result = metric.compute(use_stemmer=True) - result = {k: round(v * 100, 4) for k, v in result.items()} + eval_metrics = metric.compute(use_stemmer=True) + result = {key: round(val.mid.fmeasure * 100, 4) for key, val in eval_metrics.items()} logger.info(result) # endregion - if training_args.output_dir is not None: + if training_args.output_dir is not None and eval_metrics is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + + if training_args.output_dir is not None and not 
training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir) diff --git a/examples/tensorflow/test_tensorflow_examples.py b/examples/tensorflow/test_tensorflow_examples.py new file mode 100644 index 0000000000..9b692ce80c --- /dev/null +++ b/examples/tensorflow/test_tensorflow_examples.py @@ -0,0 +1,295 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse +import json +import logging +import os +import sys +from unittest import skip +from unittest.mock import patch + +import tensorflow as tf + +from transformers.testing_utils import TestCasePlus, get_gpu_count, slow + + +SRC_DIRS = [ + os.path.join(os.path.dirname(__file__), dirname) + for dirname in [ + "text-generation", + "text-classification", + "token-classification", + "language-modeling", + "multiple-choice", + "question-answering", + "summarization", + "translation", + ] +] +sys.path.extend(SRC_DIRS) + + +if SRC_DIRS is not None: + import run_clm + import run_mlm + import run_ner + import run_qa as run_squad + import run_summarization + import run_swag + import run_text_classification + import run_translation + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() + + +def get_setup_file(): + parser = argparse.ArgumentParser() + parser.add_argument("-f") + args = parser.parse_args() + return args.f + + +def get_results(output_dir): + results = {} + path = os.path.join(output_dir, "all_results.json") + if os.path.exists(path): + with open(path, "r") as f: + results = json.load(f) + else: + raise ValueError(f"can't find {path}") + return results + + +def is_cuda_available(): + return bool(tf.config.list_physical_devices("GPU")) + + +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +class ExamplesTests(TestCasePlus): + @skip("Skipping until shape inference for to_tf_dataset PR is merged.") + def test_run_text_classification(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_text_classification.py + --model_name_or_path distilbert-base-uncased + --output_dir {tmp_dir} + --overwrite_output_dir + --train_file ./tests/fixtures/tests_samples/MRPC/train.csv + --validation_file ./tests/fixtures/tests_samples/MRPC/dev.csv + --do_train + --do_eval + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --learning_rate=1e-4 + --max_steps=10 + 
--warmup_steps=2 + --seed=42 + --max_seq_length=128 + """.split() + + if is_cuda_available(): + testargs.append("--fp16") + + with patch.object(sys, "argv", testargs): + run_text_classification.main() + # Reset the mixed precision policy so we don't break other tests + tf.keras.mixed_precision.set_global_policy("float32") + result = get_results(tmp_dir) + self.assertGreaterEqual(result["eval_accuracy"], 0.75) + + def test_run_clm(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_clm.py + --model_name_or_path distilgpt2 + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --do_train + --do_eval + --block_size 128 + --per_device_train_batch_size 2 + --per_device_eval_batch_size 1 + --num_train_epochs 2 + --output_dir {tmp_dir} + --overwrite_output_dir + """.split() + + if len(tf.config.list_physical_devices("GPU")) > 1: + # Skipping because there are not enough batches to train the model + would need a drop_last to work. 
+ return + + with patch.object(sys, "argv", testargs): + run_clm.main() + result = get_results(tmp_dir) + self.assertLess(result["eval_perplexity"], 100) + + def test_run_mlm(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_mlm.py + --model_name_or_path distilroberta-base + --train_file ./tests/fixtures/sample_text.txt + --validation_file ./tests/fixtures/sample_text.txt + --max_seq_length 64 + --output_dir {tmp_dir} + --overwrite_output_dir + --do_train + --do_eval + --prediction_loss_only + --num_train_epochs=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_mlm.main() + result = get_results(tmp_dir) + self.assertLess(result["eval_perplexity"], 42) + + def test_run_ner(self): + # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu + epochs = 7 if get_gpu_count() > 1 else 2 + + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_ner.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/conll/sample.json + --validation_file tests/fixtures/tests_samples/conll/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --do_train + --do_eval + --warmup_steps=2 + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=2 + --num_train_epochs={epochs} + --seed 7 + """.split() + + with patch.object(sys, "argv", testargs): + run_ner.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["accuracy"], 0.75) + + def test_run_squad(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_qa.py + --model_name_or_path bert-base-uncased + --version_2_with_negative + --train_file tests/fixtures/tests_samples/SQUAD/sample.json + --validation_file tests/fixtures/tests_samples/SQUAD/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=10 + --warmup_steps=2 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + 
--per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_squad.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["f1"], 30) + self.assertGreaterEqual(result["exact"], 30) + + def test_run_swag(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_swag.py + --model_name_or_path bert-base-uncased + --train_file tests/fixtures/tests_samples/swag/sample.json + --validation_file tests/fixtures/tests_samples/swag/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=20 + --warmup_steps=2 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_swag.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["val_accuracy"], 0.8) + + @slow + def test_run_summarization(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_summarization.py + --model_name_or_path t5-small + --train_file tests/fixtures/tests_samples/xsum/sample.json + --validation_file tests/fixtures/tests_samples/xsum/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --max_steps=50 + --warmup_steps=8 + --do_train + --do_eval + --learning_rate=2e-4 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + """.split() + + with patch.object(sys, "argv", testargs): + run_summarization.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["rouge1"], 10) + self.assertGreaterEqual(result["rouge2"], 2) + self.assertGreaterEqual(result["rougeL"], 7) + self.assertGreaterEqual(result["rougeLsum"], 7) + + @slow + def test_run_translation(self): + tmp_dir = self.get_auto_remove_tmp_dir() + testargs = f""" + run_translation.py + --model_name_or_path Rocketknight1/student_marian_en_ro_6_1 + --source_lang en + --target_lang ro + --train_file tests/fixtures/tests_samples/wmt16/sample.json + --validation_file 
tests/fixtures/tests_samples/wmt16/sample.json + --output_dir {tmp_dir} + --overwrite_output_dir + --warmup_steps=8 + --do_train + --do_eval + --learning_rate=3e-3 + --num_train_epochs 12 + --per_device_train_batch_size=2 + --per_device_eval_batch_size=1 + --source_lang en_XX + --target_lang ro_RO + """.split() + + with patch.object(sys, "argv", testargs): + run_translation.main() + result = get_results(tmp_dir) + self.assertGreaterEqual(result["bleu"], 30) diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 9fb0b3f8e4..d5a6b096b3 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -16,6 +16,7 @@ """ Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. +import json import logging import os import sys @@ -35,32 +36,16 @@ from transformers import ( DefaultDataCollator, HfArgumentParser, PretrainedConfig, + PushToHubCallback, TFAutoModelForSequenceClassification, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version, send_example_telemetry -# region Helper functions - - -class SavePretrainedCallback(tf.keras.callbacks.Callback): - # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary - # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback - # that saves the model with this method after each epoch. - def __init__(self, output_dir, **kwargs): - super().__init__() - self.output_dir = output_dir - - def on_epoch_end(self, epoch, logs=None): - self.model.save_pretrained(self.output_dir) - - -# endregion - - # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. check_min_version("4.22.0.dev0") @@ -312,7 +297,6 @@ def main(): # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] - non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] # Padding strategy if data_args.pad_to_max_length: @@ -394,24 +378,11 @@ def main(): ) # endregion - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - # region Convert data to a tf.data.Dataset + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync + tf_data = dict() max_samples = { "train": data_args.max_train_samples, @@ -428,31 +399,89 @@ def main(): assert "label" in datasets[key].features, f"Missing labels from {key} data!" 
if key == "train": shuffle = True - batch_size = training_args.per_device_train_batch_size - drop_remainder = True # Saves us worrying about scaling gradients for the last batch + batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False - batch_size = training_args.per_device_eval_batch_size - drop_remainder = False + batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + data = model.prepare_tf_dataset( + dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in dataset.column_names else None, + tokenizer=tokenizer, ) + data = data.with_options(dataset_options) tf_data[key] = data # endregion + # region Optimizer, loss and compilation + if training_args.do_train: + num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs + if 
training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + else: + optimizer = "adam" # Just write anything because we won't be using it + if is_regression: + metrics = [] + else: + metrics = ["accuracy"] + model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-glue" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + model_card_kwargs["task_name"] = data_args.task_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + # region Training and validation if training_args.do_train: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets @@ -472,6 +501,12 @@ def main(): # We normally do validation as part of the 
Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set + + # In this example, we compute advanced metrics only at the end of training, and only compute + # loss and accuracy on the validation set each epoch, but + # if you'd like to compute metrics every epoch that are too complex to be written as + # standard Keras metrics, you can use our KerasMetricCallback. See + # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) @@ -489,6 +524,10 @@ def main(): eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) # endregion @@ -538,6 +577,10 @@ def main(): writer.write(f"{index}\t{item}\n") # endregion + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) + if __name__ == "__main__": main() diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index b5d1903297..0cf1972e93 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -16,6 +16,7 @@ """ Fine-tuning the library models for sequence classification.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
+import json import logging import os import sys @@ -29,12 +30,12 @@ from datasets import load_dataset from transformers import ( AutoConfig, AutoTokenizer, - DataCollatorWithPadding, - DefaultDataCollator, HfArgumentParser, PretrainedConfig, + PushToHubCallback, TFAutoModelForSequenceClassification, TFTrainingArguments, + create_optimizer, set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, send_example_telemetry @@ -383,10 +384,6 @@ def main(): datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) - if data_args.pad_to_max_length: - data_collator = DefaultDataCollator(return_tensors="tf") - else: - data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion with training_args.strategy.scope(): @@ -409,24 +406,10 @@ def main(): ) # endregion - # region Optimizer, loss and compilation - optimizer = tf.keras.optimizers.Adam( - learning_rate=training_args.learning_rate, - beta_1=training_args.adam_beta1, - beta_2=training_args.adam_beta2, - epsilon=training_args.adam_epsilon, - clipnorm=training_args.max_grad_norm, - ) - if is_regression: - loss_fn = tf.keras.losses.MeanSquaredError() - metrics = [] - else: - loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = ["accuracy"] - model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) - # endregion - # region Convert data to a tf.data.Dataset + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + num_replicas = training_args.strategy.num_replicas_in_sync tf_data = dict() max_samples = { @@ -438,50 +421,121 @@ def main(): if key not in datasets: tf_data[key] = None continue + if ( + (key == "train" and not training_args.do_train) + or (key == "validation" and not training_args.do_eval) + or (key == "test" and not training_args.do_predict) + ): + tf_data[key] = None + continue if key in 
("train", "validation"): assert "label" in datasets[key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True - batch_size = training_args.per_device_train_batch_size - drop_remainder = True # Saves us worrying about scaling gradients for the last batch + batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False - batch_size = training_args.per_device_eval_batch_size - drop_remainder = False + batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) - data = dataset.to_tf_dataset( - columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + data = model.prepare_tf_dataset( + dataset, shuffle=shuffle, batch_size=batch_size, - collate_fn=data_collator, - drop_remainder=drop_remainder, - # `label_cols` is needed for user-defined losses, such as in this example - label_cols="label" if "label" in dataset.column_names else None, + tokenizer=tokenizer, ) + data = data.with_options(dataset_options) tf_data[key] = data # endregion + # region Optimizer, loss and compilation + + if 
training_args.do_train: + num_train_steps = len(tf_data["train"]) * training_args.num_train_epochs + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + + optimizer, schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, + ) + else: + optimizer = None + if is_regression: + metrics = [] + else: + metrics = ["accuracy"] + model.compile(optimizer=optimizer, metrics=metrics) + # endregion + + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-text-classification" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + # region Training and validation if tf_data["train"] is not None: - callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) - elif tf_data["validation"] is not None: - # If there's a validation dataset but no training set, just evaluate the metrics + if 
tf_data["validation"] is not None: logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) - logger.info(f"Loss: {loss:.5f}") + logger.info(f"Eval loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) - logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%") + logger.info(f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%") + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + eval_dict = {"eval_loss": loss} + if not is_regression: + eval_dict["eval_accuracy"] = accuracy + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_dict)) # endregion # region Prediction @@ -501,14 +555,9 @@ def main(): logger.info(f"Wrote predictions to {output_test_file}!") # endregion - # region Prediction losses - # This section is outside the scope() because it's very quick to compute, but behaves badly inside it - if "test" in datasets and "label" in datasets["test"].features: - print("Computing prediction loss on test labels...") - labels = datasets["test"]["label"] - loss = float(loss_fn(labels, predictions).numpy()) - print(f"Test loss: {loss:.4f}") - # endregion + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done + model.save_pretrained(training_args.output_dir) if __name__ == "__main__": diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index caa47e115a..8eb9aef92b 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -18,14 +18,14 @@ Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, C without using a Trainer. 
""" +import json import logging +import os import random from dataclasses import dataclass, field -from functools import partial from typing import Optional import datasets -import numpy as np import tensorflow as tf from datasets import ClassLabel, load_dataset @@ -33,10 +33,11 @@ import evaluate import transformers from transformers import ( CONFIG_MAPPING, - MODEL_MAPPING, AutoConfig, AutoTokenizer, + DataCollatorForTokenClassification, HfArgumentParser, + PushToHubCallback, TFAutoModelForTokenClassification, TFTrainingArguments, create_optimizer, @@ -48,11 +49,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) logger.addHandler(logging.StreamHandler()) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") - -# You should update this to your particular problem to have better documentation of `model_type` -MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) +require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/token-classification/requirements.txt") # region Command-line arguments @@ -195,61 +192,6 @@ class DataTrainingArguments: # endregion -# region Data generator -def sample_generator(dataset, tokenizer, shuffle, pad_to_multiple_of=None): - # Trim off the last partial batch if present - if shuffle: - sample_ordering = np.random.permutation(len(dataset)) - else: - sample_ordering = np.arange(len(dataset)) - for sample_idx in sample_ordering: - example = dataset[int(sample_idx)] - # Handle dicts with proper padding and conversion to tensor. 
- example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - if tokenizer.pad_token_id is not None: - example["labels"][example["attention_mask"] == 0] = -100 - example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()} - - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, tokenizer, total_batch_size, num_epochs, shuffle): - train_generator = partial(sample_generator, dataset, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int64) - for feature in dataset.features - if feature != "special_tokens_mask" - } - # This may need to be changed depending on your particular model or tokenizer! - padding_values = {key: tf.convert_to_tensor(0, dtype=tf.int64) for key in dataset.features} - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int64) - if tokenizer.pad_token_id is not None: - padding_values["input_ids"] = tf.convert_to_tensor(tokenizer.pad_token_id, dtype=tf.int64) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(0, dtype=np.int64)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - -# endregion - - def main(): # region Argument Parsing parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) @@ -419,6 +361,14 @@ def main(): train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] + if data_args.max_train_samples 
is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") @@ -439,43 +389,62 @@ def main(): # endregion # region Create TF datasets + + # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as + # well as inputs. + collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas - train_batches_per_epoch = len(train_dataset) // total_train_batch_size - tf_train_dataset = dataset_to_tf( + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=collate_fn, + batch_size=total_train_batch_size, shuffle=True, - ) + ).with_options(dataset_options) total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - eval_batches_per_epoch = len(eval_dataset) // total_eval_batch_size - tf_eval_dataset = dataset_to_tf( + tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, - tokenizer, - total_batch_size=total_eval_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=collate_fn, + batch_size=total_eval_batch_size, shuffle=False, - ) + ).with_options(dataset_options) # endregion # region Optimizer, loss and compilation + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, - num_train_steps=int(training_args.num_train_epochs * train_batches_per_epoch), - num_warmup_steps=training_args.warmup_steps, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, 
adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - def dummy_loss(y_true, y_pred): - return tf.reduce_mean(y_pred) - - model.compile(loss={"loss": dummy_loss}, optimizer=optimizer) + model.compile(optimizer=optimizer, jit_compile=training_args.xla) # endregion # Metrics @@ -517,6 +486,39 @@ def main(): # endregion + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + if data_args.dataset_name is not None: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" + else: + push_to_hub_model_id = f"{model_name}-finetuned-token-classification" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + callbacks = [ + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ] + else: + callbacks = [] + # endregion + # region Training logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") @@ -524,23 +526,43 @@ def main(): logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {total_train_batch_size}") # Only show the progress bar once on each machine. 
+ model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), - steps_per_epoch=train_batches_per_epoch, - validation_steps=eval_batches_per_epoch, + callbacks=callbacks, ) # endregion # region Predictions - # For predictions, we preload the entire validation set - note that if you have a really giant validation - # set, you might need to change this! - eval_inputs = {key: tf.ragged.constant(eval_dataset[key]).to_tensor() for key in eval_dataset.features} - predictions = model.predict(eval_inputs, batch_size=training_args.per_device_eval_batch_size)["logits"] - predictions = tf.math.argmax(predictions, axis=-1) - labels = np.array(eval_inputs["labels"]) - labels[np.array(eval_inputs["attention_mask"]) == 0] = -100 + # If you have variable batch sizes (i.e. not using pad_to_max_length), then + # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq + # length from predict(). + + try: + predictions = model.predict(tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size)["logits"] + except tf.python.framework.errors_impl.InvalidArgumentError: + raise ValueError( + "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older " + "then you will need to use --pad_to_max_length to generate predictions, as older " + "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor." 
+ ) + if isinstance(predictions, tf.RaggedTensor): + predictions = predictions.to_tensor(default_value=-100) + predictions = tf.math.argmax(predictions, axis=-1).numpy() + if "label" in eval_dataset: + labels = eval_dataset.with_format("tf")["label"] + else: + labels = eval_dataset.with_format("tf")["labels"] + if isinstance(labels, tf.RaggedTensor): + labels = labels.to_tensor(default_value=-100) + labels = labels.numpy() + attention_mask = eval_dataset.with_format("tf")["attention_mask"] + if isinstance(attention_mask, tf.RaggedTensor): + attention_mask = attention_mask.to_tensor(default_value=-100) + attention_mask = attention_mask.numpy() + labels[attention_mask == 0] = -100 preds, refs = get_labels(predictions, labels) metric.add_batch( predictions=preds, @@ -550,12 +572,15 @@ def main(): logger.info("Evaluation metrics:") for key, val in eval_metric.items(): logger.info(f"{key}: {val:.4f}") + + if training_args.output_dir is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metric)) # endregion - # We don't do predictions in the strategy scope because there are some issues in there right now. - # They'll get fixed eventually, promise! - - if training_args.output_dir is not None: + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir) diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 7f5eb9eb9d..7ccd089ca8 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -18,30 +18,32 @@ Fine-tuning the library models for translation. """ # You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. 
+import json import logging import os import sys from dataclasses import dataclass, field -from functools import partial from typing import Optional import datasets import numpy as np import tensorflow as tf from datasets import load_dataset -from tqdm import tqdm import evaluate import transformers from transformers import ( AutoConfig, AutoTokenizer, + DataCollatorForSeq2Seq, HfArgumentParser, + KerasMetricCallback, M2M100Tokenizer, MBart50Tokenizer, MBart50TokenizerFast, MBartTokenizer, MBartTokenizerFast, + PushToHubCallback, TFAutoModelForSeq2SeqLM, TFTrainingArguments, create_optimizer, @@ -224,6 +226,16 @@ class DataTrainingArguments: source_prefix: Optional[str] = field( default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} ) + forced_bos_token: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for" + " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to" + " be the target language token.(Usually it is the target language token)" + ) + }, + ) def __post_init__(self): if self.dataset_name is None and self.train_file is None and self.validation_file is None: @@ -239,70 +251,6 @@ class DataTrainingArguments: self.val_max_target_length = self.max_target_length -# endregion - -# region Data generator -def sample_generator(dataset, model, tokenizer, shuffle, pad_to_multiple_of=None): - if shuffle: - sample_ordering = np.random.permutation(len(dataset)) - else: - sample_ordering = np.arange(len(dataset)) - for sample_idx in sample_ordering: - example = dataset[int(sample_idx)] - # Handle dicts with proper padding and conversion to tensor. 
- example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of) - example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()} - if model is not None and hasattr(model, "prepare_decoder_input_ids_from_labels"): - decoder_input_ids = model.prepare_decoder_input_ids_from_labels( - labels=tf.expand_dims(example["labels"], 0) - ) - example["decoder_input_ids"] = tf.squeeze(decoder_input_ids, 0) - yield example, example["labels"] # TF needs some kind of labels, even if we don't use them - return - - -# endregion - - -# region Helper functions -def dataset_to_tf(dataset, model, tokenizer, total_batch_size, num_epochs, shuffle): - if dataset is None: - return None - train_generator = partial(sample_generator, dataset, model, tokenizer, shuffle=shuffle) - train_signature = { - feature: tf.TensorSpec(shape=(None,), dtype=tf.int32) - for feature in dataset.features - if feature != "special_tokens_mask" - } - if ( - model is not None - and "decoder_input_ids" not in train_signature - and hasattr(model, "prepare_decoder_input_ids_from_labels") - ): - train_signature["decoder_input_ids"] = train_signature["labels"] - # This may need to be changed depending on your particular model or tokenizer! 
- padding_values = { - key: tf.convert_to_tensor(tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0, dtype=tf.int32) - for key in train_signature.keys() - } - padding_values["labels"] = tf.convert_to_tensor(-100, dtype=tf.int32) - train_signature["labels"] = train_signature["input_ids"] - train_signature = (train_signature, train_signature["labels"]) - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF - tf_dataset = ( - tf.data.Dataset.from_generator(train_generator, output_signature=train_signature) - .with_options(options) - .padded_batch( - batch_size=total_batch_size, - drop_remainder=True, - padding_values=(padding_values, np.array(-100, dtype=np.int32)), - ) - .repeat(int(num_epochs)) - ) - return tf_dataset - - # endregion @@ -541,67 +489,149 @@ def main(): # endregion # region Prepare TF Dataset objects + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=64, # Reduce the number of unique shapes for XLA, especially for generation + return_tensors="tf", + ) num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas - tf_train_dataset = dataset_to_tf( + + dataset_options = tf.data.Options() + dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + + # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in + # training. This is the recommended way to use a Hugging Face dataset when training with Keras. 
You can also + # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names + # yourself if you use this method, whereas they are automatically inferred from the model input names when + # using model.prepare_tf_dataset() + # For more info see the docs: + # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset + # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset + + tf_train_dataset = model.prepare_tf_dataset( train_dataset, - model, - tokenizer, - total_batch_size=total_train_batch_size, - num_epochs=training_args.num_train_epochs, + collate_fn=data_collator, + batch_size=total_train_batch_size, shuffle=True, - ) - tf_eval_dataset = dataset_to_tf( - eval_dataset, - model, - tokenizer, - total_eval_batch_size, - num_epochs=1, - shuffle=False, - ) + ).with_options(dataset_options) + tf_eval_dataset = model.prepare_tf_dataset( + eval_dataset, collate_fn=data_collator, batch_size=total_eval_batch_size, shuffle=False + ).with_options(dataset_options) # endregion - # region Optimizer, loss and LR scheduling - # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = len(train_dataset) // training_args.per_device_train_batch_size - num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch - optimizer, lr_schedule = create_optimizer( - init_lr=training_args.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=training_args.warmup_steps, - ) - - def masked_sparse_categorical_crossentropy(y_true, y_pred): - # We clip the negative labels to 0 to avoid NaNs appearing in the output and - # fouling up everything that comes afterwards. The loss values corresponding to clipped values - # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. 
- # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely - # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. - # More pragmatically, consider redesigning your tokenizer. - losses = tf.keras.losses.sparse_categorical_crossentropy( - tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True + # region Optimizer and LR scheduling + num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs) + if training_args.warmup_steps > 0: + num_warmup_steps = training_args.warmup_steps + elif training_args.warmup_ratio > 0: + num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) + else: + num_warmup_steps = 0 + if training_args.do_train: + optimizer, lr_schedule = create_optimizer( + init_lr=training_args.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + adam_beta1=training_args.adam_beta1, + adam_beta2=training_args.adam_beta2, + adam_epsilon=training_args.adam_epsilon, + weight_decay_rate=training_args.weight_decay, + adam_global_clipnorm=training_args.max_grad_norm, ) - # Compute the per-sample loss only over the unmasked tokens - losses = tf.ragged.boolean_mask(losses, y_true != -100) - losses = tf.reduce_mean(losses, axis=-1) - return losses - + else: + optimizer = None # endregion # region Metric and postprocessing - metric = evaluate.load("sacrebleu") + if training_args.do_eval: + metric = evaluate.load("sacrebleu") - def postprocess_text(preds, labels): - preds = [pred.strip() for pred in preds] - labels = [[label.strip()] for label in labels] + if data_args.val_max_target_length is None: + data_args.val_max_target_length = data_args.max_target_length - return preds, labels + gen_kwargs = { + "max_length": data_args.val_max_target_length, + "num_beams": data_args.num_beams, + "no_repeat_ngram_size": 0, # Not supported under XLA right now, and some models set it by default + } + + def postprocess_text(preds, 
labels): + preds = [pred.strip() for pred in preds] + labels = [[label.strip()] for label in labels] + + return preds, labels + + def compute_metrics(preds): + predictions, labels = preds + if isinstance(predictions, tuple): + predictions = predictions[0] + decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + metrics = metric.compute(predictions=decoded_preds, references=decoded_labels) + return {"bleu": metrics["score"]} + + # The KerasMetricCallback allows metrics that are too complex to write as standard Keras metrics + # to be computed each epoch. Any Python code can be included in the metric_fn. This is especially + # useful for metrics like BLEU and ROUGE that perform string comparisons on decoded model outputs. + # For more information, see the docs at + # https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.KerasMetricCallback + + metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, + eval_dataset=tf_eval_dataset, + predict_with_generate=True, + use_xla_generation=True, + generate_kwargs=gen_kwargs, + ) + callbacks = [metric_callback] + else: + callbacks = [] # endregion + # region Preparing push_to_hub and model card + push_to_hub_model_id = training_args.push_to_hub_model_id + model_name = model_args.model_name_or_path.split("/")[-1] + if not push_to_hub_model_id: + push_to_hub_model_id = f"{model_name}-finetuned-{data_args.source_lang}-{data_args.target_lang}" + + model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"} + if data_args.dataset_name is not None: + model_card_kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + model_card_kwargs["dataset_args"] = 
data_args.dataset_config_name + model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + model_card_kwargs["dataset"] = data_args.dataset_name + + languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None] + if len(languages) > 0: + model_card_kwargs["language"] = languages + + if training_args.push_to_hub: + # Because this training can be quite long, we save once per epoch. + callbacks.append( + PushToHubCallback( + output_dir=training_args.output_dir, + model_id=push_to_hub_model_id, + organization=training_args.push_to_hub_organization, + token=training_args.push_to_hub_token, + tokenizer=tokenizer, + **model_card_kwargs, + ) + ) + # endregion + # region Training - model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) + eval_metrics = None + model.compile(optimizer=optimizer, jit_compile=training_args.xla) if training_args.do_train: logger.info("***** Running training *****") @@ -611,41 +641,48 @@ def main(): logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") - model.fit( - tf_train_dataset, - epochs=int(training_args.num_train_epochs), - steps_per_epoch=num_update_steps_per_epoch, - ) + if training_args.xla and not data_args.pad_to_max_length: + logger.warning( + "XLA training may be slow at first when --pad_to_max_length is not set " + "until all possible shapes have been compiled." 
+ ) + + history = model.fit(tf_train_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks) + eval_metrics = {key: val[-1] for key, val in history.history.items()} # endregion # region Validation - if data_args.val_max_target_length is None: - data_args.val_max_target_length = data_args.max_target_length + if training_args.do_eval and not training_args.do_train: + # Compiling generation with XLA yields enormous speedups, see https://huggingface.co/blog/tf-xla-generate + @tf.function(jit_compile=True) + def generate(**kwargs): + return model.generate(**kwargs) - gen_kwargs = { - "max_length": data_args.val_max_target_length, - "num_beams": data_args.num_beams, - } - if training_args.do_eval: - logger.info("Evaluation...") - for batch, labels in tqdm( - tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size - ): - batch.update(gen_kwargs) - generated_tokens = model.generate(**batch) - if isinstance(generated_tokens, tuple): - generated_tokens = generated_tokens[0] - decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) - labels = np.where(labels != -100, labels, tokenizer.pad_token_id) - decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) - decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + if training_args.do_eval: + logger.info("Evaluation...") + for batch, labels in tf_eval_dataset: + batch.update(gen_kwargs) + generated_tokens = generate(**batch) + if isinstance(generated_tokens, tuple): + generated_tokens = generated_tokens[0] + decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) - metric.add_batch(predictions=decoded_preds, references=decoded_labels) - eval_metric = 
metric.compute() - logger.info({"bleu": eval_metric["score"]}) + metric.add_batch(predictions=decoded_preds, references=decoded_labels) + + eval_metrics = metric.compute() + logger.info({"bleu": eval_metrics["score"]}) # endregion - if training_args.output_dir is not None: + if training_args.output_dir is not None and eval_metrics is not None: + output_eval_file = os.path.join(training_args.output_dir, "all_results.json") + with open(output_eval_file, "w") as writer: + writer.write(json.dumps(eval_metrics)) + + if training_args.output_dir is not None and not training_args.push_to_hub: + # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir) diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 345b2eaf1f..e2b2a961ca 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -87,6 +87,8 @@ def create_optimizer( adam_beta1: float = 0.9, adam_beta2: float = 0.999, adam_epsilon: float = 1e-8, + adam_clipnorm: Optional[float] = None, + adam_global_clipnorm: Optional[float] = None, weight_decay_rate: float = 0.0, power: float = 1.0, include_in_weight_decay: Optional[List[str]] = None, @@ -109,6 +111,11 @@ def create_optimizer( The beta2 to use in Adam. adam_epsilon (`float`, *optional*, defaults to 1e-8): The epsilon to use in Adam. + adam_clipnorm (`float`, *optional*, defaults to `None`): + If not `None`, clip the gradient norm for each weight tensor to this value. + adam_global_clipnorm (`float`, *optional*, defaults to `None`): + If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all + weight tensors, as if they were concatenated into a single vector. weight_decay_rate (`float`, *optional*, defaults to 0): The weight decay to use. 
power (`float`, *optional*, defaults to 1.0): @@ -137,12 +144,19 @@ def create_optimizer( beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], include_in_weight_decay=include_in_weight_decay, ) else: optimizer = tf.keras.optimizers.Adam( - learning_rate=lr_schedule, beta_1=adam_beta1, beta_2=adam_beta2, epsilon=adam_epsilon + learning_rate=lr_schedule, + beta_1=adam_beta1, + beta_2=adam_beta2, + epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, ) # We return the optimizer and the LR scheduler in order to better track the # evolution of the LR independently of the optimizer. diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index e662d6fca4..e9a9f8f004 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -106,6 +106,7 @@ class OptimizerNames(ExplicitEnum): @dataclass class TrainingArguments: + framework = "pt" """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. @@ -1039,25 +1040,25 @@ class TrainingArguments: self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] if self.run_name is None: self.run_name = self.output_dir - - if self.fp16_backend and self.fp16_backend != "auto": - warnings.warn( - "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" - " `half_precision_backend` instead", - FutureWarning, - ) - self.half_precision_backend = self.fp16_backend - - if self.bf16 or self.bf16_full_eval: - - if self.no_cuda and not is_torch_bf16_cpu_available(): - # cpu - raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10") - elif not self.no_cuda and not is_torch_bf16_gpu_available(): - # gpu - raise ValueError( - "Your setup doesn't support bf16/gpu. 
You need torch>=1.10, using Ampere GPU with cuda>=11.0" + if self.framework == "pt" and is_torch_available(): + if self.fp16_backend and self.fp16_backend != "auto": + warnings.warn( + "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" + " `half_precision_backend` instead", + FutureWarning, ) + self.half_precision_backend = self.fp16_backend + + if self.bf16 or self.bf16_full_eval: + + if self.no_cuda and not is_torch_bf16_cpu_available(): + # cpu + raise ValueError("Your setup doesn't support bf16/cpu. You need torch>=1.10") + elif not self.no_cuda and not is_torch_bf16_gpu_available(): + # gpu + raise ValueError( + "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" + ) if self.fp16 and self.bf16: raise ValueError("At most one of fp16 and bf16 can be True, but not both") @@ -1084,7 +1085,8 @@ class TrainingArguments: self.optim = OptimizerNames.ADAFACTOR if ( - is_torch_available() + self.framework == "pt" + and is_torch_available() and (self.device.type != "cuda") and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ) and (self.fp16 or self.fp16_full_eval) @@ -1095,7 +1097,8 @@ class TrainingArguments: ) if ( - is_torch_available() + self.framework == "pt" + and is_torch_available() and (self.device.type != "cuda") and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ) and (self.device.type != "cpu") @@ -1106,7 +1109,7 @@ class TrainingArguments: " (`--bf16_full_eval`) can only be used on CUDA or CPU devices." 
) - if is_torch_available() and self.tf32 is not None: + if self.framework == "pt" and is_torch_available() and self.tf32 is not None: if self.tf32: if is_torch_tf32_available(): torch.backends.cuda.matmul.allow_tf32 = True diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 060b78e922..fdae51f72d 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -28,6 +28,7 @@ if is_tf_available(): @dataclass class TFTrainingArguments(TrainingArguments): + framework = "tf" """ TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. @@ -188,9 +189,6 @@ class TFTrainingArguments(TrainingArguments): def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", int]: logger.info("Tensorflow: setting up strategy") - if self.xla: - tf.config.optimizer.set_jit(True) - gpus = tf.config.list_physical_devices("GPU") # Set to float16 at first