Benchmarks (#4912)

* finish benchmark * fix isort * fix setup cfg * retab * fix time measuring of tf graph mode * fix tf cuda * clean code * better error message
2020-06-22 12:06:56 +02:00
parent 18a0150bfa
commit fa0be6d761
18 changed files with 1040 additions and 363 deletions
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
+# Copyright 2020 The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
--- a/examples/benchmarking/run_benchmark_tf.py
+++ b/examples/benchmarking/run_benchmark_tf.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training in Tensorflow"""
+
+from transformers import HfArgumentParser, TensorflowBenchmark, TensorflowBenchmarkArguments
+
+
+def main():
+    parser = HfArgumentParser(TensorflowBenchmarkArguments)
+    benchmark_args = parser.parse_args_into_dataclasses()[0]
+    benchmark = TensorflowBenchmark(args=benchmark_args)
+    benchmark.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/longform-qa/eli5_app.py
+++ b/examples/longform-qa/eli5_app.py
@@ -1,11 +1,11 @@
-import numpy as np
-import torch
-
 import faiss
 import nlp
+import numpy as np
+import torch
+from elasticsearch import Elasticsearch
+
 import streamlit as st
 import transformers
-from elasticsearch import Elasticsearch
 from eli5_utils import (
    embed_questions_for_retrieval,
    make_qa_s2s_model,
--- a/examples/longform-qa/eli5_utils.py
+++ b/examples/longform-qa/eli5_utils.py
@@ -4,17 +4,17 @@ import os  # noqa: F401
 from random import choice, randint
 from time import time

+import faiss  # noqa: F401
+import nlp  # noqa: F401
 import numpy as np
 import pandas as pd
 import torch
 import torch.utils.checkpoint as checkpoint
+from elasticsearch import Elasticsearch  # noqa: F401
+from elasticsearch.helpers import bulk, streaming_bulk  # noqa: F401
 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
 from tqdm import tqdm

-import faiss  # noqa: F401
-import nlp  # noqa: F401
-from elasticsearch import Elasticsearch  # noqa: F401
-from elasticsearch.helpers import bulk, streaming_bulk  # noqa: F401
 from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup


--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -8,3 +8,8 @@ tensorflow_datasets
 pytorch-lightning==0.7.6
 matplotlib
 git-python==1.0.3
+faiss
+streamlit
+elasticsearch
+pandas
+nlp
--- a/setup.cfg
+++ b/setup.cfg
@@ -5,12 +5,15 @@ include_trailing_comma = True
 known_first_party = transformers
 known_third_party =
    absl
+    elasticsearch
    fairseq
+    faiss
    fastprogress
    git
    h5py
    matplotlib
    MeCab
+    nlp
    nltk
    numpy
    packaging
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -78,6 +78,9 @@ from .file_utils import (
    add_end_docstrings,
    add_start_docstrings,
    cached_path,
+    is_apex_available,
+    is_psutil_available,
+    is_py3nvml_available,
    is_tf_available,
    is_torch_available,
    is_torch_tpu_available,
@@ -398,7 +401,8 @@ if is_torch_available():
    from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments

    # Benchmarks
-    from .benchmark import PyTorchBenchmark, PyTorchBenchmarkArguments
+    from .benchmark.benchmark import PyTorchBenchmark
+    from .benchmark.benchmark_args import PyTorchBenchmarkArguments

 # TensorFlow
 if is_tf_available():
@@ -608,6 +612,10 @@ if is_tf_available():
    # Trainer
    from .trainer_tf import TFTrainer

+    # Benchmarks
+    from .benchmark.benchmark_tf import TensorflowBenchmark
+    from .benchmark.benchmark_args_tf import TensorflowBenchmarkArguments
+

 if not is_tf_available() and not is_torch_available():
    logger.warning(
--- a/src/transformers/benchmark/init.py
+++ b/src/transformers/benchmark/init.py
@@ -1,10 +0,0 @@
-# flake8: noqa
-# There's no way to ignore "F401 '...' imported but unused" warnings in this
-# module, but to preserve other warnings. So, don't check this module at all.
-
-from ..file_utils import is_torch_available
-
-
-if is_torch_available():
-    from .benchmark_args import PyTorchBenchmarkArguments
-    from .benchmark import PyTorchBenchmark
--- a/src/transformers/benchmark/benchmark.py
+++ b/src/transformers/benchmark/benchmark.py
@@ -20,16 +20,24 @@

 import logging
 import timeit
+from typing import Callable, Optional

 from transformers import (
    MODEL_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    PretrainedConfig,
+    is_py3nvml_available,
    is_torch_available,
-    is_torch_tpu_available,
 )

-from .benchmark_utils import Benchmark, Memory, measure_peak_memory_cpu, start_memory_tracing, stop_memory_tracing
+from .benchmark_utils import (
+    Benchmark,
+    Memory,
+    MemorySummary,
+    measure_peak_memory_cpu,
+    start_memory_tracing,
+    stop_memory_tracing,
+)


 if is_torch_available():
@@ -37,6 +45,10 @@ if is_torch_available():
    from .benchmark_args import PyTorchBenchmarkArguments


+if is_py3nvml_available():
+    import py3nvml.py3nvml as nvml
+
+
 logger = logging.getLogger(__name__)


@@ -50,28 +62,93 @@ class PyTorchBenchmark(Benchmark):
    def framework_version(self):
        return torch.__version__

-    def train(self, model_name, batch_size, sequence_length, trace_memory=False):
-        try:
+    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
+        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
+        return self._measure_speed(_inference)
+
+    def _inference_memory(
+        self, model_name: str, batch_size: int, sequence_length: int
+    ) -> [Memory, Optional[MemorySummary]]:
+        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
+        return self._measure_memory(_inference)
+
+    def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
+        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
+        return self._measure_speed(_train)
+
+    def _train_memory(
+        self, model_name: str, batch_size: int, sequence_length: int
+    ) -> [Memory, Optional[MemorySummary]]:
+        _train = self._prepare_train_func(model_name, batch_size, sequence_length)
+        return self._measure_memory(_train)
+
+    def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
        config = self.config_dict[model_name]

        if self.args.torchscript:
            config.torchscript = True
-
+        if self.args.with_lm_head:
            model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
+        else:
+            model = MODEL_MAPPING[config.__class__](config)
+
+        model.eval()
        model.to(self.args.device)
-            model.train()

        # encoder-decoder has vocab size saved differently
        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
-            input_ids = torch.randint(
-                vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
-            )
+        input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
+
+        if self.args.fp16:
+            logger.info("Running training in Mixed Precision...")
+            assert self.args.is_gpu, "Mixed precision is possible only for GPU."
+            # amp seems to have memory leaks so that memory usage
+            # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
+            model.half()
+
+        if self.args.torchscript:
+            with torch.no_grad():
+                inference_model = torch.jit.trace(model, input_ids)
+        else:
+            inference_model = model
+
+        def encoder_decoder_forward():
+            with torch.no_grad():
+                outputs = inference_model(input_ids, decoder_input_ids=input_ids)
+            return outputs
+
+        def encoder_forward():
+            with torch.no_grad():
+                outputs = inference_model(input_ids)
+            return outputs
+
+        _forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
+        return _forward
+
+    def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
+        config = self.config_dict[model_name]
+        model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)

        if self.args.torchscript:
            raise NotImplementedError("Training for torchscript is currently not implemented")
        else:
            train_model = model

+        model.eval()
+        model.to(self.args.device)
+
+        # encoder-decoder has vocab size saved differently
+        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
+        input_ids = torch.randint(vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device)
+
+        if self.args.fp16:
+            logger.info("Running training in Mixed Precision...")
+            assert self.args.is_gpu, "Mixed precision is possible only for GPU."
+
+            # amp seems to have memory leaks so that memory usage
+            # is measured using .half() for now https://github.com/NVIDIA/apex/issues/439
+            model.half()
+
        def compute_loss_and_backprob_encoder():
            loss = train_model(input_ids, labels=input_ids)[0]
            loss.backward()
@@ -87,143 +164,63 @@ class PyTorchBenchmark(Benchmark):
            if config.is_encoder_decoder
            else compute_loss_and_backprob_encoder
        )
+        return _train

-            if trace_memory is True:
-                if self.args.trace_memory_line_by_line:
-                    trace = start_memory_tracing("transformers")
-
-                if self.args.n_gpu > 0:
-                    # gpu
-                    # clear gpu cache
-                    torch.cuda.empty_cache()
-                    if hasattr(torch.cuda, "max_memory_reserved"):
-                        torch.cuda.reset_peak_memory_stats()
-                    else:
-                        logger.info(
-                            "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
-                        )
-                        torch.cuda.reset_max_memory_cached()
-
-                    # calculate loss and do backpropagation
-                    _train()
-                elif not self.args.no_tpu and is_torch_tpu_available():
-                    # tpu
-                    raise NotImplementedError(
-                        "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
-                    )
-                else:
-                    # cpu
-                    memory_bytes = measure_peak_memory_cpu(_train)
-                    memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
-
-                if self.args.trace_memory_line_by_line:
-                    summary = stop_memory_tracing(trace)
-                else:
-                    summary = None
-
-                if self.args.n_gpu > 0:
-                    # gpu
-                    if hasattr(torch.cuda, "max_memory_reserved"):
-                        memory = Memory(torch.cuda.max_memory_reserved())
-                    else:
-                        logger.info(
-                            "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
-                        )
-                        memory = Memory(torch.cuda.max_memory_reserved())
-
-                return memory, summary
-            else:
-                if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
-                    # run additional 10 times to stabilize compilation for tpu and torchscript
-                    logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
-                    timeit.repeat(
-                        _train, repeat=1, number=5,
-                    )
-
-                # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-                runtimes = timeit.repeat(_train, repeat=self.args.repeat, number=10,)
-
-                if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
-                    import torch_xla.debug.metrics as met
-
-                    self.print_fn(met.metrics_report())
-
-                return min(runtimes) / 10.0
-        except RuntimeError as e:
-            self.print_fn("Doesn't fit on GPU. {}".format(e))
-            if trace_memory:
-                return "N/A", None
-            else:
-                return "N/A"
-
-    def inference(self, model_name, batch_size, sequence_length, trace_memory=False):
+    def _measure_speed(self, func) -> float:
        try:
-            config = self.config_dict[model_name]
-            model = None
-
-            if self.args.torchscript:
-                config.torchscript = True
-
-            if self.args.with_lm_head:
-                model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
-            else:
-                model = MODEL_MAPPING[config.__class__](config)
-
-            model.eval()
-            model.to(self.args.device)
-
-            # encoder-decoder has vocab size saved differently
-            vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
-
-            input_ids = torch.randint(
-                vocab_size, (batch_size, sequence_length), dtype=torch.long, device=self.args.device
+            if self.args.is_tpu or self.args.torchscript:
+                # run additional 10 times to stabilize compilation for tpu and torchscript
+                logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
+                timeit.repeat(
+                    func, repeat=1, number=5,
                )

-            if self.args.torchscript:
-                with torch.no_grad():
-                    if config.is_encoder_decoder:
-                        raise NotImplementedError("Torchscript is currently not supported for EncoderDecoder models")
-                    else:
-                        inference_model = torch.jit.trace(model, input_ids)
-            else:
-                inference_model = model
+            # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
+            runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)

-            def encoder_decoder_forward():
-                with torch.no_grad():
-                    inference_model(input_ids, decoder_input_ids=input_ids)
+            if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
+                import torch_xla.debug.metrics as met

-            def encoder_forward():
-                with torch.no_grad():
-                    inference_model(input_ids)
+                self.print_fn(met.metrics_report())

-            _forward = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
+            return min(runtimes) / 10.0
+        except RuntimeError as e:
+            self.print_fn("Doesn't fit on GPU. {}".format(e))
+            return "N/A"

-            if trace_memory is True:
+    def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
+        try:
            if self.args.trace_memory_line_by_line:
                trace = start_memory_tracing("transformers")

-                if self.args.n_gpu > 0:
-                    # gpu
-                    # clear gpu cache
-                    torch.cuda.empty_cache()
-                    if hasattr(torch.cuda, "max_memory_reserved"):
-                        torch.cuda.reset_peak_memory_stats()
-                    else:
-                        logger.info(
-                            "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
-                        )
-                        torch.cuda.reset_max_memory_cached()
-
-                    # run forward
-                    _forward()
-                elif not self.args.no_tpu and is_torch_tpu_available():
+            if self.args.is_tpu:
                # tpu
                raise NotImplementedError(
-                        "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
+                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no_memory` or `args.no_memory=True`"
                )
+            elif self.args.is_gpu:
+                if not is_py3nvml_available():
+                    logger.warning(
+                        "py3nvml not installed, we won't log GPU memory usage. "
+                        "Install py3nvml (pip install py3nvml) to log information about GPU."
+                    )
+                    memory = "N/A"
+                else:
+                    logger.info(
+                        "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
+                    )
+                    # init nvml
+                    nvml.nvmlInit()
+                    func()
+                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
+                    meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+                    max_bytes_in_use = meminfo.used
+                    memory = Memory(max_bytes_in_use)
+                    # shutdown nvml
+                    nvml.nvmlShutdown()
            else:
                # cpu
-                    memory_bytes = measure_peak_memory_cpu(_forward)
+                memory_bytes = measure_peak_memory_cpu(func)
                memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

            if self.args.trace_memory_line_by_line:
@@ -231,39 +228,7 @@ class PyTorchBenchmark(Benchmark):
            else:
                summary = None

-                if self.args.n_gpu > 0:
-                    # gpu
-                    if hasattr(torch.cuda, "max_memory_reserved"):
-                        memory = Memory(torch.cuda.max_memory_reserved())
-                    else:
-                        logger.info(
-                            "Please consider updating PyTorch to version 1.4 to get more accuracy on GPU memory usage"
-                        )
-                        memory = Memory(torch.cuda.max_memory_cached())
-
            return memory, summary
-            else:
-
-                if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
-                    # run additional 10 times to stabilize compilation for tpu and torchscript
-                    logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
-                    timeit.repeat(
-                        _forward, repeat=1, number=5,
-                    )
-
-                # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
-                runtimes = timeit.repeat(_forward, repeat=self.args.repeat, number=10,)
-
-                if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
-                    import torch_xla.debug.metrics as met
-
-                    self.print_fn(met.metrics_report())
-
-                return min(runtimes) / 10.0
-
        except RuntimeError as e:
            self.print_fn("Doesn't fit on GPU. {}".format(e))
-            if trace_memory:
            return "N/A", None
-            else:
-                return "N/A"
--- a/src/transformers/benchmark/benchmark_args.py
+++ b/src/transformers/benchmark/benchmark_args.py
@@ -34,11 +34,17 @@ logger = logging.getLogger(__name__)

@dataclass
 class PyTorchBenchmarkArguments(BenchmarkArguments):
-    no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
    torchscript: bool = field(default=False, metadata={"help": "Trace the models using torchscript"})
-    no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
-    fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
-    tpu_print_metrics: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
+    torch_xla_tpu_print_metrics: bool = field(default=False, metadata={"help": "Print Xla/PyTorch tpu metrics"})
+    fp16_opt_level: str = field(
+        default="O1",
+        metadata={
+            "help": (
+                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                "See details at https://nvidia.github.io/apex/amp.html"
+            )
+        },
+    )

    @cached_property
    @torch_required
@@ -55,9 +61,14 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
            n_gpu = torch.cuda.device_count()
        return device, n_gpu

+    @property
+    def is_tpu(self):
+        return is_torch_tpu_available() and not self.no_tpu
+
    @property
    @torch_required
    def device_idx(self) -> int:
+        # TODO(PVP): currently only single GPU is supported
        return torch.cuda.current_device()

    @property
@@ -69,3 +80,7 @@ class PyTorchBenchmarkArguments(BenchmarkArguments):
    @torch_required
    def n_gpu(self):
        return self._setup_devices[1]
+
+    @property
+    def is_gpu(self):
+        return self.n_gpu > 0
--- a/src/transformers/benchmark/benchmark_args_tf.py
+++ b/src/transformers/benchmark/benchmark_args_tf.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Tuple
+
+from ..file_utils import cached_property, is_tf_available, tf_required
+from .benchmark_args_utils import BenchmarkArguments
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TensorflowBenchmarkArguments(BenchmarkArguments):
+    tpu_name: str = field(
+        default=None, metadata={"help": "Name of TPU"},
+    )
+    device_idx: int = field(
+        default=0, metadata={"help": "CPU / GPU device index. Defaults to 0."},
+    )
+    eager_mode: bool = field(default=False, metadata={"help": "Benchmark models in eager model."})
+    use_xla: bool = field(
+        default=False,
+        metadata={
+            "help": "Benchmark models using XLA JIT compilation. Note that `eager_model` has to be set to `False`."
+        },
+    )
+
+    @cached_property
+    @tf_required
+    def _setup_tpu(self) -> Tuple["tf.distribute.cluster_resolver.TPUClusterResolver"]:
+        if not self.no_tpu:
+            try:
+                if self.tpu_name:
+                    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(self.tpu_name)
+                else:
+                    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
+            except ValueError:
+                tpu = None
+        return tpu
+
+    @cached_property
+    @tf_required
+    def _setup_strategy(self) -> Tuple["tf.distribute.Strategy", "tf.distribute.cluster_resolver.TPUClusterResolver"]:
+        if self.is_tpu:
+            tf.config.experimental_connect_to_cluster(self._setup_tpu)
+            tf.tpu.experimental.initialize_tpu_system(self._setup_tpu)
+
+            strategy = tf.distribute.experimental.TPUStrategy(self._setup_tpu)
+        else:
+            # currently no multi gpu is allowed
+            if self.is_gpu:
+                # TODO: Currently only single GPU is supported
+                tf.config.experimental.set_visible_devices(self.gpu_list[self.device_idx], "GPU")
+                strategy = tf.distribute.OneDeviceStrategy(device=f"/gpu:{self.device_idx}")
+            else:
+                tf.config.experimental.set_visible_devices([], "GPU")  # disable GPU
+                strategy = tf.distribute.OneDeviceStrategy(device=f"/cpu:{self.device_idx}")
+
+        return strategy
+
+    @property
+    @tf_required
+    def is_tpu(self) -> bool:
+        return self._setup_tpu is not None
+
+    @property
+    @tf_required
+    def strategy(self) -> "tf.distribute.Strategy":
+        return self._setup_strategy
+
+    @property
+    @tf_required
+    def gpu_list(self):
+        return tf.config.list_physical_devices("GPU")
+
+    @property
+    @tf_required
+    def n_gpu(self) -> int:
+        if not self.no_cuda:
+            return len(self.gpu_list)
+        return 0
+
+    @property
+    def is_gpu(self) -> bool:
+        return self.n_gpu > 0
--- a/src/transformers/benchmark/benchmark_args_utils.py
+++ b/src/transformers/benchmark/benchmark_args_utils.py
@@ -16,11 +16,15 @@

 import dataclasses
 import json
+import logging
 from dataclasses import dataclass, field
 from time import time
 from typing import List


+logger = logging.getLogger(__name__)
+
+
 def list_field(default=None, metadata=None):
    return field(default_factory=lambda: default, metadata=metadata)

@@ -53,6 +57,9 @@ class BenchmarkArguments:
    )

    no_inference: bool = field(default=False, metadata={"help": "Don't benchmark inference of model"})
+    no_cuda: bool = field(default=False, metadata={"help": "Whether to run on available cuda devices"})
+    no_tpu: bool = field(default=False, metadata={"help": "Whether to run on available tpu devices"})
+    fp16: bool = field(default=False, metadata={"help": "Use FP16 to accelerate inference."})
    training: bool = field(default=False, metadata={"help": "Benchmark training of model"})
    verbose: bool = field(default=False, metadata={"help": "Verbose memory tracing"})
    no_speed: bool = field(default=False, metadata={"help": "Don't perform speed measurments"})
@@ -61,6 +68,12 @@ class BenchmarkArguments:
    save_to_csv: bool = field(default=False, metadata={"help": "Save result to a CSV file"})
    log_print: bool = field(default=False, metadata={"help": "Save all print statements in a log file"})
    no_env_print: bool = field(default=False, metadata={"help": "Don't print environment information"})
+    no_multi_process: bool = field(
+        default=False,
+        metadata={
+            "help": "Don't use multiprocessing for memory and speed measurement. It is highly recommended to use multiprocessing for accurate CPU and GPU memory measurements. This option should only be used for debugging / testing and on TPU."
+        },
+    )
    with_lm_head: bool = field(
        default=False,
        metadata={
@@ -101,4 +114,17 @@ class BenchmarkArguments:

    @property
    def model_names(self):
+        assert (
+            len(self.models) > 0
+        ), "Please make sure you provide at least one model name / model identifier, *e.g.* `--models bert-base-cased` or `args.models = ['bert-base-cased']."
        return self.models
+
+    @property
+    def do_multi_processing(self):
+        if self.no_multi_process:
+            return False
+        elif self.is_tpu:
+            logger.info("Multiprocessing is currently not possible on TPU.")
+            return False
+        else:
+            return True
--- a/src/transformers/benchmark/benchmark_tf.py
+++ b/src/transformers/benchmark/benchmark_tf.py
@@ -0,0 +1,226 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    Benchmarking the library on inference and training in PyTorch.
+"""
+
+
+import logging
+import random
+import timeit
+from functools import wraps
+from typing import Callable, Optional
+
+from transformers import (
+    TF_MODEL_MAPPING,
+    TF_MODEL_WITH_LM_HEAD_MAPPING,
+    PretrainedConfig,
+    is_py3nvml_available,
+    is_tf_available,
+)
+
+from .benchmark_utils import (
+    Benchmark,
+    Memory,
+    MemorySummary,
+    measure_peak_memory_cpu,
+    start_memory_tracing,
+    stop_memory_tracing,
+)
+
+
+if is_tf_available():
+    import tensorflow as tf
+    from .benchmark_args_tf import TensorflowBenchmarkArguments
+    from tensorflow.python.framework.errors_impl import ResourceExhaustedError
+
+if is_py3nvml_available():
+    import py3nvml.py3nvml as nvml
+
+logger = logging.getLogger(__name__)
+
+
+def run_with_tf_optimizations(do_eager_mode: bool, use_xla: bool):
+    def run_func(func):
+        @wraps(func)
+        def run_in_eager_mode(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        @wraps(func)
+        @tf.function(experimental_compile=use_xla)
+        def run_in_graph_mode(*args, **kwargs):
+            return func(*args, **kwargs)
+
+        if do_eager_mode is True:
+            assert (
+                use_xla is False
+            ), "Cannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`."
+            return run_in_eager_mode
+        else:
+            return run_in_graph_mode
+
+    return run_func
+
+
+def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> ["tf.Tensor"]:
+    rng = random.Random()
+    values = [rng.randint(0, vocab_size - 1) for i in range(batch_size * sequence_length)]
+    return tf.constant(values, shape=(batch_size, sequence_length), dtype=tf.int32)
+
+
+class TensorflowBenchmark(Benchmark):
+
+    args: TensorflowBenchmarkArguments
+    configs: PretrainedConfig
+    framework: str = "Tensorflow"
+
+    @property
+    def framework_version(self):
+        return tf.__version__
+
+    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
+        # initialize GPU on separate process
+        strategy = self.args.strategy
+        assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
+        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
+        return self._measure_speed(_inference)
+
+    def _train_speed(self, model_name, batch_size, sequence_length):
+        raise NotImplementedError(
+            "Training is currently not really implemented." "Wait for TFTrainer to support CLM and MLM."
+        )
+
+    def _inference_memory(
+        self, model_name: str, batch_size: int, sequence_length: int
+    ) -> [Memory, Optional[MemorySummary]]:
+        # initialize GPU on separate process
+        if self.args.is_gpu:
+            tf.config.experimental.set_memory_growth(self.args.gpu_list[self.args.device_idx], True)
+        strategy = self.args.strategy
+        assert strategy is not None, "A device strategy has to be initialized before using Tensorflow."
+        _inference = self._prepare_inference_func(model_name, batch_size, sequence_length)
+        return self._measure_memory(_inference)
+
+    def _train_memory(self, model_name, batch_size, sequence_length):
+        raise NotImplementedError(
+            "Training is currently not really implemented. Wait for TFTrainer to support CLM and MLM."
+        )
+
+    def _prepare_inference_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
+        config = self.config_dict[model_name]
+
+        if self.args.fp16:
+            raise NotImplementedError("Mixed precision is currently not supported.")
+
+        if self.args.with_lm_head:
+            model = TF_MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
+        else:
+            model = TF_MODEL_MAPPING[config.__class__](config)
+
+        # encoder-decoder has vocab size saved differently
+        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
+        input_ids = random_input_ids(batch_size, sequence_length, vocab_size)
+
+        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
+        def encoder_decoder_forward():
+            return model(input_ids, decoder_input_ids=input_ids, training=False)
+
+        @run_with_tf_optimizations(self.args.eager_mode, self.args.use_xla)
+        def encoder_forward():
+            return model(input_ids, training=False)
+
+        _inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward
+
+        return _inference
+
+    def _measure_speed(self, func) -> float:
+        with self.args.strategy.scope():
+            try:
+                if self.args.is_tpu or self.args.use_xla:
+                    # run additional 10 times to stabilize compilation for tpu
+                    logger.info("Do inference on TPU. Running model 5 times to stabilize compilation")
+                    timeit.repeat(func, repeat=1, number=5)
+
+                # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
+                runtimes = timeit.repeat(func, repeat=self.args.repeat, number=10,)
+
+                return min(runtimes) / 10.0
+            except ResourceExhaustedError as e:
+                self.print_fn("Doesn't fit on GPU. {}".format(e))
+
+    def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
+        logger.info(
+            "Note that Tensorflow allocates more memory than"
+            "it might need to speed up computation."
+            "The memory reported here corresponds to the memory"
+            "reported by `nvidia-smi`, which can vary depending"
+            "on total available memory on the GPU that is used."
+        )
+        with self.args.strategy.scope():
+            try:
+                if self.args.trace_memory_line_by_line:
+                    assert (
+                        self.args.eager_mode
+                    ), "`args.eager_mode` is set to `False`. Make sure to run model in eager mode to measure memory consumption line by line."
+                    trace = start_memory_tracing("transformers")
+
+                if self.args.is_tpu:
+                    # tpu
+                    raise NotImplementedError(
+                        "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
+                    )
+                elif self.args.is_gpu:
+                    # gpu
+                    if not is_py3nvml_available():
+                        logger.warning(
+                            "py3nvml not installed, we won't log GPU memory usage. "
+                            "Install py3nvml (pip install py3nvml) to log information about GPU."
+                        )
+                        memory = "N/A"
+                    else:
+                        logger.info(
+                            "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
+                        )
+                        # init nvml
+                        nvml.nvmlInit()
+                        func()
+                        handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
+                        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+                        max_bytes_in_use = meminfo.used
+                        memory = Memory(max_bytes_in_use)
+                        # shutdown nvml
+                        nvml.nvmlShutdown()
+                else:
+                    # cpu
+                    if self.args.trace_memory_line_by_line:
+                        logger.info(
+                            "When enabling line by line tracing, the max peak memory for CPU is inaccurate in Tensorflow."
+                        )
+                        memory = None
+                    else:
+                        memory_bytes = measure_peak_memory_cpu(func)
+                        memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
+                if self.args.trace_memory_line_by_line:
+                    summary = stop_memory_tracing(trace)
+                    if memory is None:
+                        memory = summary.total
+                else:
+                    summary = None
+
+                return memory, summary
+            except ResourceExhaustedError as e:
+                self.print_fn("Doesn't fit on GPU. {}".format(e))
+                return "N/A", None
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -14,14 +14,14 @@ import sys
 from abc import ABC, abstractmethod
 from collections import defaultdict, namedtuple
 from datetime import datetime
-from multiprocessing import Pipe, Process
+from multiprocessing import Pipe, Process, Queue
 from multiprocessing.connection import Connection
 from typing import Callable, Iterable, List, NamedTuple, Optional, Union

 from transformers import AutoConfig, PretrainedConfig
 from transformers import __version__ as version

-from ..file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
+from ..file_utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available
 from .benchmark_args_utils import BenchmarkArguments


@@ -31,6 +31,11 @@ if is_torch_available():
 if is_tf_available():
    from tensorflow.python.eager import context as tf_context

+if is_psutil_available():
+    import psutil
+
+if is_py3nvml_available():
+    import py3nvml.py3nvml as nvml

 if platform.system() == "Windows":
    from signal import CTRL_C_EVENT as SIGKILL
@@ -56,6 +61,45 @@ BenchmarkOutput = namedtuple(
 )


+def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
+    """
+        This function wraps another function into its own separated process.
+        In order to ensure accurate memory measurements it is important that the function
+        is executed in a separate process
+
+        Args:
+            - `func`: (`callable`): function() -> ...
+                generic function which will be executed in its own separate process
+            - `do_multi_processing`: (`bool`)
+                Whether to run function on separate process or not
+    """
+
+    def multi_process_func(*args, **kwargs):
+        # run function in an individual
+        # process to get correct memory
+        def wrapper_func(queue: Queue, *args):
+            try:
+                result = func(*args)
+            except Exception as e:
+                logger.error(e)
+                print(e)
+                result = "N/A"
+            queue.put(result)
+
+        queue = Queue()
+        p = Process(target=wrapper_func, args=[queue] + list(args))
+        p.start()
+        result = queue.get()
+        p.join()
+        return result
+
+    if do_multi_processing:
+        logging.info("fFunction {func} is executed in its own process...")
+        return multi_process_func
+    else:
+        return func
+
+
 def is_memory_tracing_enabled():
    global _is_memory_tracing_enabled
    return _is_memory_tracing_enabled
@@ -136,7 +180,7 @@ class MemorySummary(NamedTuple):
 MemoryTrace = List[UsedMemoryState]


-def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int:
+def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
    """
        measures peak cpu memory consumption of a given `function`
        running the function for at least interval seconds
@@ -148,24 +192,18 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int:
            - `function`: (`callable`): function() -> ...
                function without any arguments to measure for which to measure the peak memory

-            - `interval`: (`float`)
+            - `interval`: (`float`, `optional`, defaults to `0.5`)
                interval in second for which to measure the memory usage

+            - `device_idx`: (`int`, `optional`, defaults to `None`)
+                device id for which to measure gpu usage
+
        Returns:
            - `max_memory`: (`int`)
                cosumed memory peak in Bytes
    """
-    try:
-        import psutil
-    except (ImportError):
-        logger.warning(
-            "Psutil not installed, we won't log CPU memory usage. "
-            "Install Psutil (pip install psutil) to use CPU memory tracing."
-        )
-        max_memory = "N/A"
-    else:

-        def _get_memory(process_id: int) -> int:
+    def get_cpu_memory(process_id: int) -> int:
        """
            measures current cpu memory usage of a given `process_id`

@@ -185,6 +223,14 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int:
            raise ValueError("Error with Psutil.")
        return memory

+    if not is_psutil_available():
+        logger.warning(
+            "Psutil not installed, we won't log CPU memory usage. "
+            "Install Psutil (pip install psutil) to use CPU memory tracing."
+        )
+        max_memory = "N/A"
+    else:
+
        class MemoryMeasureProcess(Process):

            """
@@ -198,13 +244,13 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5) -> int:
                self.interval = interval
                self.connection = child_connection
                self.num_measurements = 1
-                self.mem_usage = _get_memory(process_id)
+                self.mem_usage = get_cpu_memory(self.process_id)

            def run(self):
                self.connection.send(0)
                stop = False
                while True:
-                    self.mem_usage = max(self.mem_usage, _get_memory(self.process_id))
+                    self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id))
                    self.num_measurements += 1

                    if stop:
@@ -296,34 +342,31 @@ def start_memory_tracing(
            - 'line_text' (string): Text of the line in the python script

    """
-    try:
-        import psutil
-    except (ImportError):
+    if is_psutil_available():
+        process = psutil.Process(os.getpid())
+    else:
        logger.warning(
            "Psutil not installed, we won't log CPU memory usage. "
            "Install psutil (pip install psutil) to use CPU memory tracing."
        )
        process = None
-    else:
-        process = psutil.Process(os.getpid())

+    if is_py3nvml_available():
        try:
-        from py3nvml import py3nvml
-
-        py3nvml.nvmlInit()
-        devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
-        py3nvml.nvmlShutdown()
-    except ImportError:
+            nvml.nvmlInit()
+            devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
+            nvml.nvmlShutdown()
+        except (OSError, nvml.NVMLError):
+            logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
+            log_gpu = False
+        else:
+            log_gpu = is_torch_available() or is_tf_available()
+    else:
        logger.warning(
            "py3nvml not installed, we won't log GPU memory usage. "
            "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
        )
        log_gpu = False
-    except (OSError, py3nvml.NVMLError):
-        logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
-        log_gpu = False
-    else:
-        log_gpu = is_torch_available() or is_tf_available()

    memory_trace = []

@@ -385,14 +428,14 @@ def start_memory_tracing(
                tf_context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802

            # Sum used memory for all GPUs
-            py3nvml.nvmlInit()
+            nvml.nvmlInit()

            for i in devices:
-                handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
-                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
+                handle = nvml.nvmlDeviceGetHandleByIndex(i)
+                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_mem += meminfo.used

-            py3nvml.nvmlShutdown()
+            nvml.nvmlShutdown()

        mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
        memory_trace.append(mem_state)
@@ -522,7 +565,6 @@ class Benchmark(ABC):

    def __init__(self, args: BenchmarkArguments = None, configs: PretrainedConfig = None):
        self.args = args
-
        if configs is None:
            self.config_dict = {
                model_name: AutoConfig.from_pretrained(model_name) for model_name in self.args.model_names
@@ -530,6 +572,11 @@ class Benchmark(ABC):
        else:
            self.config_dict = {model_name: config for model_name, config in zip(self.args.model_names, configs)}

+        if not self.args.no_memory and os.getenv("TRANSFORMERS_USE_MULTIPROCESSING") == 0:
+            logger.warning(
+                "Memory consumption will not be measured accurately if `args.no_multi_process` is set to `True.` The flag 'TRANSFORMERS_USE_MULTIPROCESSING' should only be disabled for debugging / testing."
+            )
+
        self._print_fn = None
        self._framework_version = None
        self._environment_info = None
@@ -541,7 +588,7 @@ class Benchmark(ABC):

                def print_and_log(*args):
                    with open(self.args.log_filename, "a") as log_file:
-                        log_file.write(str(*args) + "\n")
+                        log_file.write("".join(args) + "\n")
                    print(*args)

                self._print_fn = print_and_log
@@ -549,27 +596,43 @@ class Benchmark(ABC):
                self._print_fn = print
        return self._print_fn

-    @property
-    def is_gpu(self):
-        return self.args.n_gpu > 0
-
-    @property
-    def is_tpu(self):
-        return is_torch_tpu_available() and not self.args.no_tpu
-
    @property
    @abstractmethod
    def framework_version(self):
        pass

    @abstractmethod
-    def train(self, model_name, batch_size, sequence_length):
+    def _inference_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        pass

    @abstractmethod
-    def inference(self, model_name, batch_size, sequence_length):
+    def _train_speed(self, model_name: str, batch_size: int, sequence_length: int) -> float:
        pass

+    @abstractmethod
+    def _inference_memory(
+        self, model_name: str, batch_size: int, sequence_length: int
+    ) -> [Memory, Optional[MemorySummary]]:
+        pass
+
+    @abstractmethod
+    def _train_memory(
+        self, model_name: str, batch_size: int, sequence_length: int
+    ) -> [Memory, Optional[MemorySummary]]:
+        pass
+
+    def inference_speed(self, *args, **kwargs) -> float:
+        return separate_process_wrapper_fn(self._inference_speed, self.args.do_multi_processing)(*args, **kwargs)
+
+    def train_speed(self, *args, **kwargs) -> float:
+        return separate_process_wrapper_fn(self._train_speed, self.args.do_multi_processing)(*args, **kwargs)
+
+    def inference_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
+        return separate_process_wrapper_fn(self._inference_memory, self.args.do_multi_processing)(*args, **kwargs)
+
+    def train_memory(self, *args, **kwargs) -> [Memory, Optional[MemorySummary]]:
+        return separate_process_wrapper_fn(self._train_memory, self.args.do_multi_processing)(*args, **kwargs)
+
    def run(self):
        result_dict = {model_name: {} for model_name in self.args.model_names}
        inference_result_time = copy.deepcopy(result_dict)
@@ -596,64 +659,60 @@ class Benchmark(ABC):
                for sequence_length in self.args.sequence_lengths:
                    if not self.args.no_inference:
                        if not self.args.no_memory:
-                            memory, inference_summary = self.inference(
-                                model_name, batch_size, sequence_length, trace_memory=True
-                            )
+                            memory, inference_summary = self.inference_memory(model_name, batch_size, sequence_length)
                            inference_result_memory[model_name]["result"][batch_size][sequence_length] = memory
                        if not self.args.no_speed:
-                            time = self.inference(model_name, batch_size, sequence_length, trace_memory=False)
+                            time = self.inference_speed(model_name, batch_size, sequence_length)
                            inference_result_time[model_name]["result"][batch_size][sequence_length] = time

                    if self.args.training:
                        if not self.args.no_memory:
-                            memory, train_summary = self.train(
-                                model_name, batch_size, sequence_length, trace_memory=True
-                            )
+                            memory, train_summary = self.train_memory(model_name, batch_size, sequence_length)
                            train_result_memory[model_name]["result"][batch_size][sequence_length] = memory
                        if not self.args.no_speed:
-                            time = self.inference(model_name, batch_size, sequence_length, trace_memory=False)
+                            time = self.train_speed(model_name, batch_size, sequence_length)
                            train_result_time[model_name]["result"][batch_size][sequence_length] = time

        if not self.args.no_inference:
            if not self.args.no_speed:
-                self.print_fn("======= INFERENCE - SPEED - RESULT =======")
-                self.print_results(inference_result_time)
+                self.print_fn("\n" + 20 * "=" + ("INFERENCE - SPEED - RESULT").center(40) + 20 * "=")
+                self.print_results(inference_result_time, type_label="Time in s")
                self.save_to_csv(inference_result_time, self.args.inference_time_csv_file)
-                if self.is_tpu:
+                if self.args.is_tpu:
                    self.print_fn(
                        "TPU was used for inference. Note that the time after compilation stabilized (after ~10 inferences model.forward(..) calls) was measured."
                    )

            if not self.args.no_memory:
-                self.print_fn("======= INFERENCE - MEMORY - RESULT =======")
-                self.print_results(inference_result_memory)
+                self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMORY - RESULT").center(40) + 20 * "=")
+                self.print_results(inference_result_memory, type_label="Memory in MB")
                self.save_to_csv(inference_result_memory, self.args.inference_memory_csv_file)

            if self.args.trace_memory_line_by_line:
-                self.print_fn("======= INFERENCE - MEMORY LINE BY LINE TRACE - SUMMARY =======")
+                self.print_fn("\n" + 20 * "=" + ("INFERENCE - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
                self.print_memory_trace_statistics(inference_summary)

        if self.args.training:
            if not self.args.no_speed:
-                self.print_fn("======= TRAIN - SPEED - RESULT =======")
-                self.print_results(train_result_time)
+                self.print_fn("\n" + 20 * "=" + ("TRAIN - SPEED - RESULTS").center(40) + 20 * "=")
+                self.print_results(train_result_time, "Time in s")
                self.save_to_csv(train_result_time, self.args.train_time_csv_file)
-                if self.is_tpu:
+                if self.args.is_tpu:
                    self.print_fn(
                        "TPU was used for training. Note that the time after compilation stabilized (after ~10 train loss=model.forward(...) + loss.backward() calls) was measured."
                    )

            if not self.args.no_memory:
-                self.print_fn("======= TRAIN - MEMORY - RESULT =======")
-                self.print_results(train_result_memory)
+                self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMORY - RESULTS").center(40) + 20 * "=")
+                self.print_results(train_result_memory, type_label="Memory in MB")
                self.save_to_csv(train_result_memory, self.args.train_memory_csv_file)

            if self.args.trace_memory_line_by_line:
-                self.print_fn("======= TRAIN - MEMORY LINE BY LINE TRACE - SUMMARY =======")
+                self.print_fn("\n" + 20 * "=" + ("TRAIN - MEMOMRY - LINE BY LINE - SUMMARY").center(40) + 20 * "=")
                self.print_memory_trace_statistics(train_summary)

        if not self.args.no_env_print:
-            self.print_fn("\n======== ENVIRONMENT - INFORMATION ========")
+            self.print_fn("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
            self.print_fn(
                "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n"
            )
@@ -681,6 +740,9 @@ class Benchmark(ABC):
            info["framework"] = self.framework
            if self.framework == "PyTorch":
                info["use_torchscript"] = self.args.torchscript
+            if self.framework == "Tensorflow":
+                info["eager_mode"] = self.args.eager_mode
+                info["use_xla"] = self.args.use_xla
            info["framework_version"] = self.framework_version
            info["python_version"] = platform.python_version()
            info["system"] = platform.system()
@@ -688,27 +750,30 @@ class Benchmark(ABC):
            info["architecture"] = platform.architecture()[0]
            info["date"] = datetime.date(datetime.now())
            info["time"] = datetime.time(datetime.now())
+            info["fp16"] = self.args.fp16
+            info["use_multiprocessing"] = self.args.do_multi_processing

-            try:
-                import psutil
-            except (ImportError):
+            if is_psutil_available():
+                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
+            else:
                logger.warning(
                    "Psutil not installed, we won't log available CPU memory."
                    "Install psutil (pip install psutil) to log available CPU memory."
                )
                info["cpu_ram_mb"] = "N/A"
+
+            info["use_gpu"] = self.args.is_gpu
+            if self.args.is_gpu:
+                info["num_gpus"] = 1  # TODO(PVP) Currently only single GPU is supported
+                if is_py3nvml_available():
+                    nvml.nvmlInit()
+                    handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
+                    info["gpu"] = nvml.nvmlDeviceGetName(handle)
+                    info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
+                    info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
+                    info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
+                    nvml.nvmlShutdown()
                else:
-                info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
-
-            info["use_gpu"] = self.is_gpu
-            if self.is_gpu:
-                info["num_gpus"] = self.args.n_gpu
-                try:
-                    from py3nvml import py3nvml
-
-                    py3nvml.nvmlInit()
-                    handle = py3nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
-                except ImportError:
                    logger.warning(
                        "py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to log information about GPU."
@@ -717,41 +782,35 @@ class Benchmark(ABC):
                    info["gpu_ram_mb"] = "N/A"
                    info["gpu_power_watts"] = "N/A"
                    info["gpu_performance_state"] = "N/A"
-                except (OSError, py3nvml.NVMLError):
-                    logger.warning(
-                        "Error while initializing comunication with GPU. " "We won't log information about GPU."
-                    )
-                    info["gpu"] = "N/A"
-                    info["gpu_ram_mb"] = "N/A"
-                    info["gpu_power_watts"] = "N/A"
-                    info["gpu_performance_state"] = "N/A"
-                    py3nvml.nvmlShutdown()
-                else:
-                    info["gpu"] = py3nvml.nvmlDeviceGetName(handle)
-                    info["gpu_ram_mb"] = bytes_to_mega_bytes(py3nvml.nvmlDeviceGetMemoryInfo(handle).total)
-                    info["gpu_power_watts"] = py3nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
-                    info["gpu_performance_state"] = py3nvml.nvmlDeviceGetPerformanceState(handle)
-                    py3nvml.nvmlShutdown()

-            info["use_tpu"] = self.is_tpu
+            info["use_tpu"] = self.args.is_tpu
            # TODO(PVP): See if we can add more information about TPU
            # see: https://github.com/pytorch/xla/issues/2180

            self._environment_info = info
        return self._environment_info

-    def print_results(self, result_dict):
+    def print_results(self, result_dict, type_label):
+        self.print_fn(80 * "-")
+        self.print_fn(
+            "Model Name".center(30) + "Batch Size".center(15) + "Seq Length".center(15) + type_label.center(15)
+        )
+        self.print_fn(80 * "-")
        for model_name in self.args.model_names:
-            self.print_fn("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
            for batch_size in result_dict[model_name]["bs"]:
                for sequence_length in result_dict[model_name]["ss"]:
                    result = result_dict[model_name]["result"][batch_size][sequence_length]
                    if isinstance(result, float):
-                        self.print_fn(
-                            f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{(round(1000 * result) / 1000)}s"
-                        )
+                        result = round(1000 * result) / 1000
+                        result = "< 0.001" if result == 0.0 else str(result)
                    else:
-                        self.print_fn(f"\t\t{model_name}/{batch_size}/{sequence_length}: " f"{result} MB")
+                        result = str(result)
+                    self.print_fn(
+                        model_name.center(30) + str(batch_size).center(15),
+                        str(sequence_length).center(15),
+                        result.center(15),
+                    )
+        self.print_fn(80 * "-")

    def print_memory_trace_statistics(self, summary: MemorySummary):
        self.print_fn(
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -81,6 +81,31 @@ except ImportError:
    _torch_tpu_available = False


+try:
+    import psutil  # noqa: F401
+
+    _psutil_available = True
+
+except ImportError:
+    _psutil_available = False
+
+
+try:
+    import py3nvml  # noqa: F401
+
+    _py3nvml_available = True
+
+except ImportError:
+    _py3nvml_available = False
+
+
+try:
+    from apex import amp  # noqa: F401
+
+    _has_apex = True
+except ImportError:
+    _has_apex = False
+
 default_cache_path = os.path.join(torch_cache_home, "transformers")


@@ -115,6 +140,18 @@ def is_torch_tpu_available():
    return _torch_tpu_available


+def is_psutil_available():
+    return _psutil_available
+
+
+def is_py3nvml_available():
+    return _py3nvml_available
+
+
+def is_apex_available():
+    return _has_apex
+
+
 def add_start_docstrings(*docstr):
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -20,23 +20,16 @@ from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
 from tqdm.auto import tqdm, trange

 from .data.data_collator import DataCollator, default_data_collator
+from .file_utils import is_apex_available, is_torch_tpu_available
 from .modeling_utils import PreTrainedModel
 from .optimization import AdamW, get_linear_schedule_with_warmup
 from .trainer_utils import PREFIX_CHECKPOINT_DIR, EvalPrediction, PredictionOutput, TrainOutput, is_wandb_available
-from .training_args import TrainingArguments, is_torch_tpu_available
+from .training_args import TrainingArguments


-try:
+if is_apex_available():
    from apex import amp

-    _has_apex = True
-except ImportError:
-    _has_apex = False
-
-
-def is_apex_available():
-    return _has_apex
-

 if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -5,7 +5,7 @@ from pathlib import Path

 from transformers import AutoConfig, is_torch_available

-from .utils import require_torch
+from .utils import require_torch, torch_device


 if is_torch_available():
@@ -26,7 +26,12 @@ class BenchmarkTest(unittest.TestCase):
    def test_inference_no_configs(self):
        MODEL_ID = "sshleifer/tiny-gpt2"
        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args)
        results = benchmark.run()
@@ -42,6 +47,24 @@ class BenchmarkTest(unittest.TestCase):
            torchscript=True,
            sequence_lengths=[8],
            batch_sizes=[1],
+            no_multi_process=True,
+        )
+        benchmark = PyTorchBenchmark(benchmark_args)
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_inference_fp16(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        benchmark_args = PyTorchBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            fp16=True,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args)
        results = benchmark.run()
@@ -51,7 +74,29 @@ class BenchmarkTest(unittest.TestCase):
    def test_train_no_configs(self):
        MODEL_ID = "sshleifer/tiny-gpt2"
        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
+            models=[MODEL_ID],
+            training=True,
+            no_inference=True,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
+        )
+        benchmark = PyTorchBenchmark(benchmark_args)
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_train_result)
+        self.check_results_dict_not_empty(results.memory_train_result)
+
+    @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
+    def test_train_no_configs_fp16(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        benchmark_args = PyTorchBenchmarkArguments(
+            models=[MODEL_ID],
+            training=True,
+            no_inference=True,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            fp16=True,
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args)
        results = benchmark.run()
@@ -62,7 +107,12 @@ class BenchmarkTest(unittest.TestCase):
        MODEL_ID = "sshleifer/tiny-gpt2"
        config = AutoConfig.from_pretrained(MODEL_ID)
        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
        results = benchmark.run()
@@ -73,7 +123,12 @@ class BenchmarkTest(unittest.TestCase):
        MODEL_ID = "sshleifer/tinier_bart"
        config = AutoConfig.from_pretrained(MODEL_ID)
        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=False, no_inference=False, sequence_lengths=[8], batch_sizes=[1]
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
        results = benchmark.run()
@@ -81,26 +136,15 @@ class BenchmarkTest(unittest.TestCase):
        self.check_results_dict_not_empty(results.memory_inference_result)

    def test_train_with_configs(self):
-        MODEL_ID = "sshleifer/tiny-gpt2"
-        config = AutoConfig.from_pretrained(MODEL_ID)
-        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
-        )
-        benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
-        results = benchmark.run()
-        self.check_results_dict_not_empty(results.time_train_result)
-        self.check_results_dict_not_empty(results.memory_train_result)
-
-    def test_train_with_configs_torchscript(self):
        MODEL_ID = "sshleifer/tiny-gpt2"
        config = AutoConfig.from_pretrained(MODEL_ID)
        benchmark_args = PyTorchBenchmarkArguments(
            models=[MODEL_ID],
            training=True,
            no_inference=True,
-            torchscript=True,
            sequence_lengths=[8],
            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
        results = benchmark.run()
@@ -111,7 +155,12 @@ class BenchmarkTest(unittest.TestCase):
        MODEL_ID = "sshleifer/tinier_bart"
        config = AutoConfig.from_pretrained(MODEL_ID)
        benchmark_args = PyTorchBenchmarkArguments(
-            models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]
+            models=[MODEL_ID],
+            training=True,
+            no_inference=True,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
        )
        benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
        results = benchmark.run()
@@ -133,6 +182,7 @@ class BenchmarkTest(unittest.TestCase):
                inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
                train_time_csv_file=os.path.join(tmp_dir, "train_time.csv"),
                env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
+                no_multi_process=True,
            )
            benchmark = PyTorchBenchmark(benchmark_args)
            benchmark.run()
@@ -161,6 +211,7 @@ class BenchmarkTest(unittest.TestCase):
                log_filename=os.path.join(tmp_dir, "log.txt"),
                log_print=True,
                trace_memory_line_by_line=True,
+                no_multi_process=True,
            )
            benchmark = PyTorchBenchmark(benchmark_args)
            result = benchmark.run()
--- a/tests/test_benchmark_tf.py
+++ b/tests/test_benchmark_tf.py
@@ -0,0 +1,165 @@
+import os
+import tempfile
+import unittest
+from pathlib import Path
+
+from transformers import AutoConfig, is_tf_available
+
+from .utils import require_tf
+
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers import TensorflowBenchmark, TensorflowBenchmarkArguments
+
+
+@require_tf
+class TFBenchmarkTest(unittest.TestCase):
+    def check_results_dict_not_empty(self, results):
+        for model_result in results.values():
+            for batch_size, sequence_length in zip(model_result["bs"], model_result["ss"]):
+                result = model_result["result"][batch_size][sequence_length]
+                self.assertIsNotNone(result)
+
+    def test_inference_no_configs_eager(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            eager_mode=True,
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args)
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    def test_inference_no_configs_graph(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args)
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    def test_inference_with_configs_eager(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        config = AutoConfig.from_pretrained(MODEL_ID)
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            eager_mode=True,
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args, [config])
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    def test_inference_with_configs_graph(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        config = AutoConfig.from_pretrained(MODEL_ID)
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args, [config])
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    def test_inference_encoder_decoder_with_configs(self):
+        MODEL_ID = "patrickvonplaten/t5-tiny-random"
+        config = AutoConfig.from_pretrained(MODEL_ID)
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args, configs=[config])
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    @unittest.skipIf(is_tf_available() and len(tf.config.list_physical_devices("GPU")) == 0, "Cannot do xla on CPU.")
+    def test_inference_no_configs_xla(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        benchmark_args = TensorflowBenchmarkArguments(
+            models=[MODEL_ID],
+            training=False,
+            no_inference=False,
+            sequence_lengths=[8],
+            batch_sizes=[1],
+            use_xla=True,
+            no_multi_process=True,
+        )
+        benchmark = TensorflowBenchmark(benchmark_args)
+        results = benchmark.run()
+        self.check_results_dict_not_empty(results.time_inference_result)
+        self.check_results_dict_not_empty(results.memory_inference_result)
+
+    def test_save_csv_files(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            benchmark_args = TensorflowBenchmarkArguments(
+                models=[MODEL_ID],
+                no_inference=False,
+                save_to_csv=True,
+                sequence_lengths=[8],
+                batch_sizes=[1],
+                inference_time_csv_file=os.path.join(tmp_dir, "inf_time.csv"),
+                inference_memory_csv_file=os.path.join(tmp_dir, "inf_mem.csv"),
+                env_info_csv_file=os.path.join(tmp_dir, "env.csv"),
+                no_multi_process=True,
+            )
+            benchmark = TensorflowBenchmark(benchmark_args)
+            benchmark.run()
+            self.assertTrue(Path(os.path.join(tmp_dir, "inf_time.csv")).exists())
+            self.assertTrue(Path(os.path.join(tmp_dir, "inf_mem.csv")).exists())
+            self.assertTrue(Path(os.path.join(tmp_dir, "env.csv")).exists())
+
+    def test_trace_memory(self):
+        MODEL_ID = "sshleifer/tiny-gpt2"
+
+        def _check_summary_is_not_empty(summary):
+            self.assertTrue(hasattr(summary, "sequential"))
+            self.assertTrue(hasattr(summary, "cumulative"))
+            self.assertTrue(hasattr(summary, "current"))
+            self.assertTrue(hasattr(summary, "total"))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            benchmark_args = TensorflowBenchmarkArguments(
+                models=[MODEL_ID],
+                no_inference=False,
+                sequence_lengths=[8],
+                batch_sizes=[1],
+                log_filename=os.path.join(tmp_dir, "log.txt"),
+                log_print=True,
+                trace_memory_line_by_line=True,
+                eager_mode=True,
+                no_multi_process=True,
+            )
+            benchmark = TensorflowBenchmark(benchmark_args)
+            result = benchmark.run()
+            _check_summary_is_not_empty(result.inference_summary)
+            self.assertTrue(Path(os.path.join(tmp_dir, "log.txt")).exists())