From 7e22609e0f8ae63b5c223fe6633a01d0800adf00 Mon Sep 17 00:00:00 2001
From: Matt <Rocketknight1@users.noreply.github.com>
Date: Mon, 28 Jun 2021 19:31:44 +0100
Subject: [PATCH] Tensorflow LM examples (#12358)

* Tensorflow MLM example

* Add CLM example

* Style fixes, adding missing checkpoint code from the CLM example

* Fix TPU training, avoid massive dataset warnings

* Fix incorrect training length calculation for multi-GPU training

* Fix incorrect training length calculation for multi-GPU training

* Refactors and nitpicks from the review

* Style pass

* Adding README
---
 .../tensorflow/language-modeling/README.md    |  63 ++
 .../tensorflow/language-modeling/run_clm.py   | 545 ++++++++++++++++
 .../tensorflow/language-modeling/run_mlm.py   | 604 ++++++++++++++++++
 3 files changed, 1212 insertions(+)
 create mode 100644 examples/tensorflow/language-modeling/README.md
 create mode 100755 examples/tensorflow/language-modeling/run_clm.py
 create mode 100755 examples/tensorflow/language-modeling/run_mlm.py

diff --git a/examples/tensorflow/language-modeling/README.md b/examples/tensorflow/language-modeling/README.md
new file mode 100644
index 0000000000..ac1b4a96b8
--- /dev/null
+++ b/examples/tensorflow/language-modeling/README.md
@@ -0,0 +1,63 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Language modelling examples
+
+This folder contains some scripts showing examples of *language model pre-training* with the 🤗 Transformers library.
+For straightforward use-cases you may be able to use these scripts without modification, although we have also
+included comments in the code to indicate areas that you may need to adapt to your own projects. The two scripts
+have almost identical arguments, but they differ in the type of LM they train - a causal language model (like GPT) or a 
+masked language model (like BERT). Masked language models generally train more quickly and perform better when 
+fine-tuned on new tasks with a task-specific output head, like text classification. However, their ability to generate
+text is weaker than causal language models.
+
+## Pre-training versus fine-tuning
+
+These scripts can be used to both *pre-train* a language model completely from scratch, as well as to *fine-tune*
+a language model on text from your domain of interest. To start with an existing pre-trained language model you
+can use the `--model_name_or_path` argument, or to train from scratch you can use the `--model_type` argument
+to indicate the class of model architecture to initialize.
+
+### Multi-GPU and TPU usage
+
+By default, these scripts use a `MirroredStrategy` and will use multiple GPUs effectively if they are available. TPUs
+can also be used by passing the name of the TPU resource with the `--tpu` argument.
+
+## run_mlm.py
+
+This script trains a masked language model.
+
+### Example command
+```
+python run_mlm.py \
+--model_name_or_path distilbert-base-cased \
+--output_dir output \
+--dataset_name wikitext \
+--dataset_config_name wikitext-103-raw-v1
+```
+
+## run_clm.py
+
+This script trains a causal language model.
+
+### Example command
+```
+python run_clm.py \
+--model_name_or_path distilgpt2 \
+--output_dir output \
+--dataset_name wikitext \
+--dataset_config_name wikitext-103-raw-v1
+```
diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
new file mode 100755
index 0000000000..57ffb831ae
--- /dev/null
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -0,0 +1,545 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
+on a text file or a dataset without using HuggingFace Trainer.
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=masked-lm
+"""
+# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
+
+# region Imports
+import logging
+import math
+import os
+import random
+import sys
+from dataclasses import dataclass, field
+from functools import partial
+from pathlib import Path
+from typing import Optional
+
+import datasets
+import numpy as np
+import tensorflow as tf
+from datasets import load_dataset
+
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    CONFIG_NAME,
+    MODEL_FOR_MASKED_LM_MAPPING,
+    TF2_WEIGHTS_NAME,
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser,
+    TFAutoModelForCausalLM,
+    TFTrainingArguments,
+    create_optimizer,
+    set_seed,
+)
+from transformers.utils.versions import require_version
+
+
+logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+# endregion
+
+
+# region Command-line arguments
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_overrides: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
+            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+        },
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+
+    def __post_init__(self):
+        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
+            raise ValueError(
+                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
+            )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+
+
+# endregion
+
+# region Helper classes
+class SavePretrainedCallback(tf.keras.callbacks.Callback):
+    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
+    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
+    # that saves the model with this method after each epoch.
+    def __init__(self, output_dir, **kwargs):
+        super().__init__()
+        self.output_dir = output_dir
+
+    def on_epoch_end(self, epoch, logs=None):
+        self.model.save_pretrained(self.output_dir)
+
+
+# endregion
+
+# region Data generator
+def sample_generator(dataset, tokenizer):
+    # Trim off the last partial batch if present
+    sample_ordering = np.random.permutation(len(dataset))
+    for sample_idx in sample_ordering:
+        example = dataset[int(sample_idx)]
+        # Handle dicts with proper padding and conversion to tensor.
+        example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int64) for key, arr in example.items()}
+        yield example, example["labels"]  # TF needs some kind of labels, even if we don't use them
+    return
+
+
+# endregion
+
+
+def main():
+    # region Argument Parsing
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sanity checks
+    if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    else:
+        if data_args.train_file is not None:
+            extension = data_args.train_file.split(".")[-1]
+            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
+        if data_args.validation_file is not None:
+            extension = data_args.validation_file.split(".")[-1]
+            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
+
+    if training_args.output_dir is not None:
+        training_args.output_dir = Path(training_args.output_dir)
+        os.makedirs(training_args.output_dir, exist_ok=True)
+
+    if isinstance(training_args.strategy, tf.distribute.TPUStrategy) and not data_args.pad_to_max_length:
+        logger.warning("We are training on TPU - forcing pad_to_max_length")
+        data_args.pad_to_max_length = True
+    # endregion
+
+    # region Checkpoints
+    # Detecting last checkpoint.
+    checkpoint = None
+    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
+        config_path = training_args.output_dir / CONFIG_NAME
+        weights_path = training_args.output_dir / TF2_WEIGHTS_NAME
+        if config_path.is_file() and weights_path.is_file():
+            checkpoint = training_args.output_dir
+            logger.info(
+                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
+                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+        else:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to continue regardless."
+            )
+
+    # endregion
+
+    # region Setup logging
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO)
+    datasets.utils.logging.set_verbosity_warning()
+    transformers.utils.logging.set_verbosity_info()
+    # endregion
+
+    # If passed along, set the training seed now.
+    if training_args.seed is not None:
+        set_seed(training_args.seed)
+
+    # region Load datasets
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # endregion
+
+    # region Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # endregion
+
+    # region Dataset preprocessing
+    # First we tokenize all the texts.
+    column_names = raw_datasets["train"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    if data_args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+        if max_seq_length > 1024:
+            logger.warning(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
+            )
+            max_seq_length = 1024
+    else:
+        if data_args.max_seq_length > tokenizer.model_max_length:
+            logger.warning(
+                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    # First we tokenize all the texts.
+    column_names = raw_datasets["train"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    def tokenize_function(examples):
+        return tokenizer(examples[text_column_name])
+
+    tokenized_datasets = raw_datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not data_args.overwrite_cache,
+        desc="Running tokenizer on dataset",
+    )
+
+    block_size = tokenizer.model_max_length
+    if block_size > 1024:
+        logger.warning(
+            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+            "Picking 1024 instead. You can reduce that value by passing --block_size xxx."
+        )
+        block_size = 1024
+
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
+    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
+    # to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+
+    lm_datasets = tokenized_datasets.map(
+        group_texts,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+        desc=f"Grouping texts in chunks of {block_size}",
+    )
+
+    train_dataset = lm_datasets["train"]
+    eval_dataset = lm_datasets["validation"]
+
+    if data_args.max_train_samples is not None:
+        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+    if data_args.max_eval_samples is not None:
+        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    # endregion
+
+    with training_args.strategy.scope():
+        # region Prepare model
+        if checkpoint is not None:
+            model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config)
+        elif model_args.model_name_or_path:
+            model = TFAutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config)
+        else:
+            logger.info("Training new model from scratch")
+            model = TFAutoModelForCausalLM.from_config(config)
+
+        model.resize_token_embeddings(len(tokenizer))
+        # endregion
+
+        # region TF Dataset preparation
+        num_replicas = training_args.strategy.num_replicas_in_sync
+        train_generator = partial(sample_generator, train_dataset, tokenizer)
+        train_signature = {
+            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
+            for feature in train_dataset.features
+            if feature != "special_tokens_mask"
+        }
+        train_sig = (train_signature, train_signature["labels"])
+        options = tf.data.Options()
+        options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+        tf_train_dataset = (
+            tf.data.Dataset.from_generator(train_generator, output_signature=train_sig)
+            .with_options(options)
+            .batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True)
+            .repeat(int(training_args.num_train_epochs))
+        )
+        eval_generator = partial(sample_generator, eval_dataset, tokenizer)
+        eval_signature = {
+            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
+            for feature in eval_dataset.features
+            if feature != "special_tokens_mask"
+        }
+        eval_sig = (eval_signature, eval_signature["labels"])
+        tf_eval_dataset = (
+            tf.data.Dataset.from_generator(eval_generator, output_signature=eval_sig)
+            .with_options(options)
+            .batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True)
+            .repeat(int(training_args.num_train_epochs))
+        )
+        # endregion
+
+        # region Optimizer and loss
+        batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
+        # Bias and layernorm weights are automatically excluded from the decay
+        optimizer, lr_schedule = create_optimizer(
+            init_lr=training_args.learning_rate,
+            num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
+            num_warmup_steps=training_args.warmup_steps,
+            adam_beta1=training_args.adam_beta1,
+            adam_beta2=training_args.adam_beta2,
+            adam_epsilon=training_args.adam_epsilon,
+            weight_decay_rate=training_args.weight_decay,
+        )
+
+        def dummy_loss(y_true, y_pred):
+            return tf.reduce_mean(y_pred)
+
+        model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
+        # endregion
+
+        # region Training and validation
+        logger.info("***** Running training *****")
+        logger.info(f"  Num examples = {len(train_dataset)}")
+        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
+        logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+        logger.info(f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
+
+        history = model.fit(
+            tf_train_dataset,
+            validation_data=tf_eval_dataset,
+            epochs=int(training_args.num_train_epochs),
+            steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
+            callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+        )
+        try:
+            train_perplexity = math.exp(history.history["loss"][-1])
+        except OverflowError:
+            train_perplexity = math.inf
+        try:
+            validation_perplexity = math.exp(history.history["val_loss"][-1])
+        except OverflowError:
+            validation_perplexity = math.inf
+        logger.info(f"  Final train loss: {history.history['loss'][-1]:.3f}")
+        logger.info(f"  Final train perplexity: {train_perplexity:.3f}")
+        logger.info(f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
+        logger.info(f"  Final validation perplexity: {validation_perplexity:.3f}")
+        # endregion
+
+        if training_args.output_dir is not None:
+            model.save_pretrained(training_args.output_dir)
+
+    if training_args.push_to_hub:
+        # You'll probably want to include some of your own metadata here!
+        model.push_to_hub()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
new file mode 100755
index 0000000000..c82a662006
--- /dev/null
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
+on a text file or a dataset without using HuggingFace Trainer.
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=masked-lm
+"""
+# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
+
+# TODO Do multi-GPU and TPU tests and make sure the dataset length works as expected
+# TODO Duplicate all changes over to the CLM script
+
+import logging
+import math
+import os
+import random
+import sys
+from dataclasses import dataclass, field
+from functools import partial
+from pathlib import Path
+from typing import Optional
+
+import datasets
+import numpy as np
+import tensorflow as tf
+from datasets import load_dataset
+
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    CONFIG_NAME,
+    MODEL_FOR_MASKED_LM_MAPPING,
+    TF2_WEIGHTS_NAME,
+    AutoConfig,
+    AutoTokenizer,
+    HfArgumentParser,
+    TFAutoModelForMaskedLM,
+    TFTrainingArguments,
+    create_optimizer,
+    set_seed,
+)
+from transformers.utils.versions import require_version
+
+
+logger = logging.getLogger(__name__)
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+# region Command-line arguments
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_overrides: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "Override some existing default config settings when a model is trained from scratch. Example: "
+            "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
+        },
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+
+    def __post_init__(self):
+        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
+            raise ValueError(
+                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
+            )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+
+
+# endregion
+
+
+# region Helper classes
+class SavePretrainedCallback(tf.keras.callbacks.Callback):
+    # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary
+    # metadata to allow them to be loaded as a pretrained model in future. This is a simple Keras callback
+    # that saves the model with this method after each epoch.
+    def __init__(self, output_dir, **kwargs):
+        super().__init__()
+        self.output_dir = output_dir
+
+    def on_epoch_end(self, epoch, logs=None):
+        self.model.save_pretrained(self.output_dir)
+
+
+# endregion
+
+# region Data generator
+def sample_generator(dataset, tokenizer, mlm_probability=0.15, pad_to_multiple_of=None):
+    if tokenizer.mask_token is None:
+        raise ValueError("This tokenizer does not have a mask token which is necessary for masked language modeling. ")
+    # Trim off the last partial batch if present
+    sample_ordering = np.random.permutation(len(dataset))
+    for sample_idx in sample_ordering:
+        example = dataset[int(sample_idx)]
+        # Handle dicts with proper padding and conversion to tensor.
+        example = tokenizer.pad(example, return_tensors="np", pad_to_multiple_of=pad_to_multiple_of)
+        special_tokens_mask = example.pop("special_tokens_mask", None)
+        example["input_ids"], example["labels"] = mask_tokens(
+            example["input_ids"], mlm_probability, tokenizer, special_tokens_mask=special_tokens_mask
+        )
+        if tokenizer.pad_token_id is not None:
+            example["labels"][example["labels"] == tokenizer.pad_token_id] = -100
+        example = {key: tf.convert_to_tensor(arr) for key, arr in example.items()}
+
+        yield example, example["labels"]  # TF needs some kind of labels, even if we don't use them
+    return
+
+
+def mask_tokens(inputs, mlm_probability, tokenizer, special_tokens_mask):
+    """
+    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+    """
+    labels = np.copy(inputs)
+    # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
+    probability_matrix = np.random.random_sample(labels.shape)
+    special_tokens_mask = special_tokens_mask.astype(np.bool_)
+
+    probability_matrix[special_tokens_mask] = 0.0
+    masked_indices = probability_matrix > (1 - mlm_probability)
+    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+    indices_replaced = (np.random.random_sample(labels.shape) < 0.8) & masked_indices
+    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
+
+    # 10% of the time, we replace masked input tokens with random word
+    indices_random = (np.random.random_sample(labels.shape) < 0.5) & masked_indices & ~indices_replaced
+    random_words = np.random.randint(low=0, high=len(tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64)
+    inputs[indices_random] = random_words
+
+    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+    return inputs, labels
+
+
+# endregion
+
+
+def main():
+    # region Argument Parsing
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Sanity checks
+    if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    else:
+        if data_args.train_file is not None:
+            extension = data_args.train_file.split(".")[-1]
+            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
+        if data_args.validation_file is not None:
+            extension = data_args.validation_file.split(".")[-1]
+            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
+
+    if training_args.output_dir is not None:
+        training_args.output_dir = Path(training_args.output_dir)
+        os.makedirs(training_args.output_dir, exist_ok=True)
+
+    if isinstance(training_args.strategy, tf.distribute.TPUStrategy) and not data_args.pad_to_max_length:
+        logger.warning("We are training on TPU - forcing pad_to_max_length")
+        data_args.pad_to_max_length = True
+    # endregion
+
+    # region Checkpoints
+    # Detecting last checkpoint.
+    checkpoint = None
+    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
+        config_path = training_args.output_dir / CONFIG_NAME
+        weights_path = training_args.output_dir / TF2_WEIGHTS_NAME
+        if config_path.is_file() and weights_path.is_file():
+            checkpoint = training_args.output_dir
+            logger.warning(
+                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
+                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+        else:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to continue regardless."
+            )
+
+    # endregion
+
+    # region Setup logging
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO)
+    datasets.utils.logging.set_verbosity_warning()
+    transformers.utils.logging.set_verbosity_info()
+    # endregion
+
+    # If passed along, set the training seed now.
+    if training_args.seed is not None:
+        set_seed(training_args.seed)
+
+    # region Load datasets
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # endregion
+
+    # region Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if checkpoint is not None:
+        config = AutoConfig.from_pretrained(checkpoint)
+    elif model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+    # endregion
+
+    # region Dataset preprocessing
+    # First we tokenize all the texts.
+    column_names = raw_datasets["train"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    if data_args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+        if max_seq_length > 1024:
+            logger.warning(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can reduce that default value by passing --max_seq_length xxx."
+            )
+            max_seq_length = 1024
+    else:
+        if data_args.max_seq_length > tokenizer.model_max_length:
+            logger.warning(
+                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples[text_column_name] = [
+                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
+            ]
+            return tokenizer(
+                examples[text_column_name],
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
+                # receives the `special_tokens_mask`.
+                return_special_tokens_mask=True,
+            )
+
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset line_by_line",
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on every text in dataset",
+        )
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {max_seq_length}",
+        )
+
+    train_dataset = tokenized_datasets["train"]
+    if data_args.max_train_samples is not None:
+        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+    eval_dataset = tokenized_datasets["validation"]
+    if data_args.max_eval_samples is not None:
+        eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    # endregion
+
+    with training_args.strategy.scope():
+        # region Prepare model
+        if checkpoint is not None:
+            model = TFAutoModelForMaskedLM.from_pretrained(checkpoint, config=config)
+        elif model_args.model_name_or_path:
+            model = TFAutoModelForMaskedLM.from_pretrained(model_args.model_name_or_path, config=config)
+        else:
+            logger.info("Training new model from scratch")
+            model = TFAutoModelForMaskedLM.from_config(config)
+
+        model.resize_token_embeddings(len(tokenizer))
+        # endregion
+
+        # region TF Dataset preparation
+        num_replicas = training_args.strategy.num_replicas_in_sync
+        train_generator = partial(sample_generator, train_dataset, tokenizer)
+        train_signature = {
+            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
+            for feature in train_dataset.features
+            if feature != "special_tokens_mask"
+        }
+        train_signature["labels"] = train_signature["input_ids"]
+        train_signature = (train_signature, train_signature["labels"])
+        options = tf.data.Options()
+        options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
+        tf_train_dataset = (
+            tf.data.Dataset.from_generator(train_generator, output_signature=train_signature)
+            .with_options(options)
+            .batch(batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True)
+            .repeat(int(training_args.num_train_epochs))
+        )
+        eval_generator = partial(sample_generator, eval_dataset, tokenizer)
+        eval_signature = {
+            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
+            for feature in eval_dataset.features
+            if feature != "special_tokens_mask"
+        }
+        eval_signature["labels"] = eval_signature["input_ids"]
+        eval_signature = (eval_signature, eval_signature["labels"])
+        tf_eval_dataset = (
+            tf.data.Dataset.from_generator(eval_generator, output_signature=eval_signature)
+            .with_options(options)
+            .batch(batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True)
+        )
+        # endregion
+
+        # region Optimizer and loss
+        batches_per_epoch = len(train_dataset) // (num_replicas * training_args.per_device_train_batch_size)
+        # Bias and layernorm weights are automatically excluded from the decay
+        optimizer, lr_schedule = create_optimizer(
+            init_lr=training_args.learning_rate,
+            num_train_steps=int(training_args.num_train_epochs * batches_per_epoch),
+            num_warmup_steps=training_args.warmup_steps,
+            adam_beta1=training_args.adam_beta1,
+            adam_beta2=training_args.adam_beta2,
+            adam_epsilon=training_args.adam_epsilon,
+            weight_decay_rate=training_args.weight_decay,
+        )
+
+        def dummy_loss(y_true, y_pred):
+            return tf.reduce_mean(y_pred)
+
+        model.compile(optimizer=optimizer, loss={"loss": dummy_loss})
+        # endregion
+
+        # region Training and validation
+        logger.info("***** Running training *****")
+        logger.info(f"  Num examples = {len(train_dataset)}")
+        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
+        logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+        logger.info(f"  Total train batch size = {training_args.per_device_train_batch_size * num_replicas}")
+
+        history = model.fit(
+            tf_train_dataset,
+            validation_data=tf_eval_dataset,
+            epochs=int(training_args.num_train_epochs),
+            steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas),
+            callbacks=[SavePretrainedCallback(output_dir=training_args.output_dir)],
+        )
+        try:
+            train_perplexity = math.exp(history.history["loss"][-1])
+        except OverflowError:
+            train_perplexity = math.inf
+        try:
+            validation_perplexity = math.exp(history.history["val_loss"][-1])
+        except OverflowError:
+            validation_perplexity = math.inf
+        logger.warning(f"  Final train loss: {history.history['loss'][-1]:.3f}")
+        logger.warning(f"  Final train perplexity: {train_perplexity:.3f}")
+        logger.warning(f"  Final validation loss: {history.history['val_loss'][-1]:.3f}")
+        logger.warning(f"  Final validation perplexity: {validation_perplexity:.3f}")
+        # endregion
+
+        if training_args.output_dir is not None:
+            model.save_pretrained(training_args.output_dir)
+
+    if training_args.push_to_hub:
+        # You'll probably want to append some of your own metadata here!
+        model.push_to_hub()
+
+
+if __name__ == "__main__":
+    main()