Trainer (#3800)

* doc * [tests] Add sample files for a regression task * [HUGE] Trainer * Feedback from @sshleifer * Feedback from @thomwolf + logging tweak * [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes * [glue] Use default max_seq_length of 128 like before * [glue] move DataTrainingArguments around * [ner] Change interface of InputExample, and align run_{tf,pl} * Re-align the pl scripts a little bit * ner * [ner] Add integration test * Fix language_modeling with API tweak * [ci] Tweak loss target * Don't break console output * amp.initialize: model must be on right device before * [multiple-choice] update for Trainer * Re-align to 827d6d6ef0
2020-04-21 20:11:56 -04:00
parent eb5601b0a5
commit dd9d483d03
41 changed files with 2682 additions and 2567 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -31,6 +31,8 @@ from .benchmark_utils import (
    start_memory_tracing,
    stop_memory_tracing,
 )
+
+# Configurations
 from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig
 from .configuration_bart import BartConfig
@@ -46,8 +48,6 @@ from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, Open
 from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
 from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
 from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
-
-# Configurations
 from .configuration_utils import PretrainedConfig
 from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
 from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
@@ -121,6 +121,8 @@ from .pipelines import (
    TranslationPipeline,
    pipeline,
 )
+
+# Tokenizers
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from .tokenization_bart import BartTokenizer, MBartTokenizer
@@ -136,8 +138,6 @@ from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
 from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_t5 import T5Tokenizer
 from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
-
-# Tokenizers
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_xlm_roberta import XLMRobertaTokenizer
@@ -162,6 +162,7 @@ if is_torch_available():
        AutoModelForQuestionAnswering,
        AutoModelWithLMHead,
        AutoModelForTokenClassification,
+        AutoModelForMultipleChoice,
        ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
        MODEL_MAPPING,
        MODEL_FOR_PRETRAINING_MAPPING,
@@ -169,6 +170,7 @@ if is_torch_available():
        MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+        MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
    )

    from .modeling_bert import (
@@ -320,6 +322,10 @@ if is_torch_available():
        get_linear_schedule_with_warmup,
    )

+    # Trainer
+    from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
+    from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling
+    from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments

 # TensorFlow
 if is_tf_available():
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -87,7 +87,7 @@ class PretrainedConfig(object):
        self.architectures = kwargs.pop("architectures", None)
        self.finetuning_task = kwargs.pop("finetuning_task", None)
        self.num_labels = kwargs.pop("num_labels", 2)
-        self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
+        self.id2label = kwargs.pop("id2label", {i: f"LABEL_{i}" for i in range(self.num_labels)})
        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
        self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -0,0 +1,144 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, NewType, Tuple
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+from ..tokenization_utils import PreTrainedTokenizer
+
+
+class DataCollator(ABC):
+    """
+    A `DataCollator` is responsible for batching
+    and pre-processing samples of data as requested by the training loop.
+    """
+
+    @abstractmethod
+    def collate_batch(self) -> Dict[str, torch.Tensor]:
+        """
+        Take a list of samples from a Dataset and collate them into a batch.
+
+        Returns:
+            A dictionary of tensors
+        """
+        pass
+
+
+InputDataClass = NewType("InputDataClass", Any)
+
+
+@dataclass
+class DefaultDataCollator(DataCollator):
+    """
+    Very simple data collator that:
+    - simply collates batches of dict-like objects
+    - Performs special handling for potential keys named:
+        - `label`: handles a single value (int or float) per object
+        - `label_ids`: handles a list of values per object
+    - does not do any additional preprocessing
+
+    i.e., Property names of the input object will be used as corresponding inputs to the model.
+    See glue and ner for example of how it's useful.
+    """
+
+    def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
+        # In this method we'll make the assumption that all `features` in the batch
+        # have the same attributes.
+        # So we will look at the first element as a proxy for what attributes exist
+        # on the whole batch.
+        first = features[0]
+
+        # Special handling for labels.
+        # Ensure that tensor is created with the correct type
+        # (it should be automatically the case, but let's make sure of it.)
+        if hasattr(first, "label") and first.label is not None:
+            if type(first.label) is int:
+                labels = torch.tensor([f.label for f in features], dtype=torch.long)
+            else:
+                labels = torch.tensor([f.label for f in features], dtype=torch.float)
+            batch = {"labels": labels}
+        elif hasattr(first, "label_ids") and first.label_ids is not None:
+            if type(first.label_ids[0]) is int:
+                labels = torch.tensor([f.label_ids for f in features], dtype=torch.long)
+            else:
+                labels = torch.tensor([f.label_ids for f in features], dtype=torch.float)
+            batch = {"labels": labels}
+        else:
+            batch = {}
+
+        # Handling of all other possible attributes.
+        # Again, we will use the first element to figure out which key/values are not None for this model.
+        for k, v in vars(first).items():
+            if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
+                batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long)
+        return batch
+
+
+@dataclass
+class DataCollatorForLanguageModeling(DataCollator):
+    """
+    Data collator used for language modeling.
+    - collates batches of tensors, honoring their tokenizer's pad_token
+    - preprocesses batches for masked language modeling
+    """
+
+    tokenizer: PreTrainedTokenizer
+    mlm: bool = True
+    mlm_probability: float = 0.15
+
+    def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
+        batch = self._tensorize_batch(examples)
+        if self.mlm:
+            inputs, labels = self.mask_tokens(batch)
+            return {"input_ids": inputs, "masked_lm_labels": labels}
+        else:
+            return {"input_ids": batch, "labels": batch}
+
+    def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
+        length_of_first = examples[0].size(0)
+        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
+        if are_tensors_same_length:
+            return torch.stack(examples, dim=0)
+        else:
+            if self.tokenizer._pad_token is None:
+                raise ValueError(
+                    "You are attempting to pad samples but the tokenizer you are using"
+                    f" ({self.tokenizer.__class__.__name__}) does not have one."
+                )
+            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+    def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+        """
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
+            )
+
+        labels = inputs.clone()
+        # We sample a few tokens in each sequence for masked-LM training (with probability self.mlm_probability, which defaults to 0.15 as in Bert/RoBERTa)
+        probability_matrix = torch.full(labels.shape, self.mlm_probability)
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+        ]
+        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+        if self.tokenizer._pad_token is not None:
+            padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            probability_matrix.masked_fill_(padding_mask, value=0.0)
+        masked_indices = torch.bernoulli(probability_matrix).bool()
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
--- a/src/transformers/data/datasets/__init__.py
+++ b/src/transformers/data/datasets/__init__.py
@@ -0,0 +1,6 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module while preserving other warnings. So, don't check this module at all.
+
+from .glue import GlueDataset, GlueDataTrainingArguments
+from .language_modeling import LineByLineTextDataset, TextDataset
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@@ -0,0 +1,124 @@
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import torch
+from torch.utils.data.dataset import Dataset
+
+from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
+from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_xlm_roberta import XLMRobertaTokenizer
+from ...trainer import torch_distributed_zero_first
+from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
+from ..processors.utils import InputFeatures
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GlueDataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input to our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+    def __post_init__(self):
+        self.task_name = self.task_name.lower()
+
+
+class GlueDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+    """
+
+    args: GlueDataTrainingArguments
+    output_mode: str
+    features: List[InputFeatures]
+
+    def __init__(
+        self,
+        args: GlueDataTrainingArguments,
+        tokenizer: PreTrainedTokenizer,
+        limit_length: Optional[int] = None,
+        evaluate=False,
+        local_rank=-1,
+    ):
+        self.args = args
+        processor = glue_processors[args.task_name]()
+        self.output_mode = glue_output_modes[args.task_name]
+        # Load data features from cache or dataset file
+        cached_features_file = os.path.join(
+            args.data_dir,
+            "cached_{}_{}_{}_{}".format(
+                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
+            ),
+        )
+        with torch_distributed_zero_first(local_rank):
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                start = time.time()
+                self.features = torch.load(cached_features_file)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+            else:
+                logger.info(f"Creating features from dataset file at {args.data_dir}")
+                label_list = processor.get_labels()
+                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
+                    RobertaTokenizer,
+                    RobertaTokenizerFast,
+                    XLMRobertaTokenizer,
+                ):
+                    # HACK(label indices are swapped in RoBERTa pretrained model)
+                    label_list[1], label_list[2] = label_list[2], label_list[1]
+                examples = (
+                    processor.get_dev_examples(args.data_dir)
+                    if evaluate
+                    else processor.get_train_examples(args.data_dir)
+                )
+                if limit_length is not None:
+                    examples = examples[:limit_length]
+                self.features = glue_convert_examples_to_features(
+                    examples,
+                    tokenizer,
+                    max_length=args.max_seq_length,
+                    label_list=label_list,
+                    output_mode=self.output_mode,
+                )
+                if local_rank in [-1, 0]:
+                    start = time.time()
+                    torch.save(self.features, cached_features_file)
+                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
+                    logger.info(
+                        f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                    )
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, i) -> InputFeatures:
+        return self.features[i]
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -0,0 +1,101 @@
+import logging
+import os
+import pickle
+import time
+
+import torch
+from torch.utils.data.dataset import Dataset
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...trainer import torch_distributed_zero_first
+
+
+logger = logging.getLogger(__name__)
+
+
+class TextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+    """
+
+    def __init__(
+        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1,
+    ):
+        assert os.path.isfile(file_path)
+
+        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
+
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(
+            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
+        )
+
+        with torch_distributed_zero_first(local_rank):
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+
+            if os.path.exists(cached_features_file) and not overwrite_cache:
+                start = time.time()
+                with open(cached_features_file, "rb") as handle:
+                    self.examples = pickle.load(handle)
+                logger.info(
+                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
+                )
+
+            else:
+                logger.info(f"Creating features from dataset file at {directory}")
+
+                self.examples = []
+                with open(file_path, encoding="utf-8") as f:
+                    text = f.read()
+
+                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                    self.examples.append(
+                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
+                    )
+                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
+                # If your dataset is small, first you should look for a bigger one :-) and second you
+                # can change this behavior by adding (model specific) padding.
+
+                start = time.time()
+                with open(cached_features_file, "wb") as handle:
+                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+                logger.info(
+                    f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                )
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> torch.Tensor:
+        return torch.tensor(self.examples[i], dtype=torch.long)
+
+
+class LineByLineTextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
+        assert os.path.isfile(file_path)
+        # Here, we do not cache the features, operating under the assumption
+        # that we will soon use fast multithreaded tokenizers from the
+        # `tokenizers` repo everywhere =)
+        logger.info("Creating features from dataset file at %s", file_path)
+
+        with open(file_path, encoding="utf-8") as f:
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+
+        lines = lines[:50_000]
+        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
+        self.examples = batch_encoding["input_ids"]
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> torch.Tensor:
+        return torch.tensor(self.examples[i], dtype=torch.long)
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -17,6 +17,7 @@

 import logging
 import os
+from enum import Enum
 from typing import List, Optional, Union

 from ...file_utils import is_tf_available
@@ -153,6 +154,11 @@ def _glue_convert_examples_to_features(
    return features


+class OutputMode(Enum):
+    classification = "classification"
+    regression = "regression"
+
+
 class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -14,13 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import copy
 import csv
 import dataclasses
 import json
 import logging
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional, Union

 from ...file_utils import is_tf_available, is_torch_available

@@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available
 logger = logging.getLogger(__name__)


-@dataclass(frozen=False)
+@dataclass
 class InputExample:
    """
    A single training/test example for simple sequence classification.
@@ -50,42 +49,37 @@ class InputExample:

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
-        return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self), indent=2) + "\n"


-class InputFeatures(object):
+@dataclass(frozen=True)
+class InputFeatures:
    """
    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
    """

-    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self)) + "\n"


-class DataProcessor(object):
+class DataProcessor:
    """Base class for data converters for sequence classification data sets."""

    def get_example_from_tensor_dict(self, tensor_dict):
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -456,6 +456,11 @@ def get_from_cache(
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):

+        # If the download just completed while the lock was activated.
+        if os.path.exists(cache_path) and not force_download:
+            # Even if returning early like here, the lock will be released.
+            return cache_path
+
        if resume_download:
            incomplete_path = cache_path + ".incomplete"

@@ -496,3 +501,50 @@ def get_from_cache(
            json.dump(meta, meta_file)

    return cache_path
+
+
+class cached_property(property):
+    """
+    Descriptor that mimics @property but caches output in member variable.
+
+    From tensorflow_datasets
+
+    Built-in in functools from Python 3.8.
+    """
+
+    def __get__(self, obj, objtype=None):
+        # See docs.python.org/3/howto/descriptor.html#properties
+        if obj is None:
+            return self
+        if self.fget is None:
+            raise AttributeError("unreadable attribute")
+        attr = "__cached_" + self.fget.__name__
+        cached = getattr(obj, attr, None)
+        if cached is None:
+            cached = self.fget(obj)
+            setattr(obj, attr, cached)
+        return cached
+
+
+def torch_required(func):
+    # Chose a different decorator name than in tests so it's clear they are not the same.
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if is_torch_available():
+            return func(*args, **kwargs)
+        else:
+            raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
+
+    return wrapper
+
+
+def tf_required(func):
+    # Chose a different decorator name than in tests so it's clear they are not the same.
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if is_tf_available():
+            return func(*args, **kwargs)
+        else:
+            raise ImportError(f"Method `{func.__name__}` requires TF.")
+
+    return wrapper
--- a/src/transformers/modeling_auto.py
+++ b/src/transformers/modeling_auto.py
@@ -55,6 +55,7 @@ from .modeling_bart import (
 from .modeling_bert import (
    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    BertForMaskedLM,
+    BertForMultipleChoice,
    BertForPreTraining,
    BertForQuestionAnswering,
    BertForSequenceClassification,
@@ -64,6 +65,7 @@ from .modeling_bert import (
 from .modeling_camembert import (
    CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    CamembertForMaskedLM,
+    CamembertForMultipleChoice,
    CamembertForSequenceClassification,
    CamembertForTokenClassification,
    CamembertModel,
@@ -96,6 +98,7 @@ from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTL
 from .modeling_roberta import (
    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForMaskedLM,
+    RobertaForMultipleChoice,
    RobertaForQuestionAnswering,
    RobertaForSequenceClassification,
    RobertaForTokenClassification,
@@ -114,12 +117,14 @@ from .modeling_xlm import (
 from .modeling_xlm_roberta import (
    XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLMRobertaForMaskedLM,
+    XLMRobertaForMultipleChoice,
    XLMRobertaForSequenceClassification,
    XLMRobertaForTokenClassification,
    XLMRobertaModel,
 )
 from .modeling_xlnet import (
    XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+    XLNetForMultipleChoice,
    XLNetForQuestionAnsweringSimple,
    XLNetForSequenceClassification,
    XLNetForTokenClassification,
@@ -259,7 +264,18 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
 )


-class AutoModel(object):
+MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
+    [
+        (CamembertConfig, CamembertForMultipleChoice),
+        (XLMRobertaConfig, XLMRobertaForMultipleChoice),
+        (RobertaConfig, RobertaForMultipleChoice),
+        (BertConfig, BertForMultipleChoice),
+        (XLNetConfig, XLNetForMultipleChoice),
+    ]
+)
+
+
+class AutoModel:
    r"""
        :class:`~transformers.AutoModel` is a generic model class
        that will be instantiated as one of the base model classes of the library
@@ -410,7 +426,7 @@ class AutoModel(object):
        )


-class AutoModelForPreTraining(object):
+class AutoModelForPreTraining:
    r"""
        :class:`~transformers.AutoModelForPreTraining` is a generic model class
        that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
@@ -552,7 +568,7 @@ class AutoModelForPreTraining(object):
        )


-class AutoModelWithLMHead(object):
+class AutoModelWithLMHead:
    r"""
        :class:`~transformers.AutoModelWithLMHead` is a generic model class
        that will be instantiated as one of the language modeling model classes of the library
@@ -696,7 +712,7 @@ class AutoModelWithLMHead(object):
        )


-class AutoModelForSequenceClassification(object):
+class AutoModelForSequenceClassification:
    r"""
        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
        that will be instantiated as one of the sequence classification model classes of the library
@@ -843,7 +859,7 @@ class AutoModelForSequenceClassification(object):
        )


-class AutoModelForQuestionAnswering(object):
+class AutoModelForQuestionAnswering:
    r"""
        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
        that will be instantiated as one of the question answering model classes of the library
@@ -1126,3 +1142,55 @@ class AutoModelForTokenClassification:
                ", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
            )
        )
+
+
+class AutoModelForMultipleChoice:
+    r"""
+        :class:`~transformers.AutoModelForMultipleChoice` is a generic model class
+        that will be instantiated as one of the multiple choice model classes of the library
+        when created with the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+
+    def __init__(self):
+        raise EnvironmentError(
+            "AutoModelForMultipleChoice is designed to be instantiated "
+            "using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForMultipleChoice.from_config(config)` methods."
+        )
+
+    @classmethod
+    def from_config(cls, config):
+        for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items():
+            if isinstance(config, config_class):
+                return model_class(config)
+
+        raise ValueError(
+            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
+            "Model type should be one of {}.".format(
+                config.__class__,
+                cls.__name__,
+                ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()),
+            )
+        )
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        config = kwargs.pop("config", None)
+        if not isinstance(config, PretrainedConfig):
+            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items():
+            if isinstance(config, config_class):
+                return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
+
+        raise ValueError(
+            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
+            "Model type should be one of {}.".format(
+                config.__class__,
+                cls.__name__,
+                ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys()),
+            )
+        )
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -0,0 +1,558 @@
+import json
+import logging
+import os
+import random
+import re
+import shutil
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Dataset
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data.sampler import RandomSampler
+from tqdm import tqdm, trange
+
+from .data.data_collator import DataCollator, DefaultDataCollator
+from .modeling_utils import PreTrainedModel
+from .optimization import AdamW, get_linear_schedule_with_warmup
+from .training_args import TrainingArguments
+
+
+# Probe for NVIDIA apex once at import time; `amp` is only bound when the import succeeds.
+try:
+    from apex import amp
+except ImportError:
+    _has_apex = False
+else:
+    _has_apex = True
+
+
+def is_apex_available():
+    """Return True when `apex.amp` could be imported at module load."""
+    return _has_apex
+
+
+# Prefer the SummaryWriter bundled with PyTorch and fall back to tensorboardX;
+# the flag records whether either import succeeded.
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    try:
+        from tensorboardX import SummaryWriter
+    except ImportError:
+        _has_tensorboard = False
+    else:
+        _has_tensorboard = True
+else:
+    _has_tensorboard = True
+
+
+def is_tensorboard_available():
+    """Return True when a `SummaryWriter` implementation could be imported at module load."""
+    return _has_tensorboard
+
+
+logger = logging.getLogger(__name__)
+
+
+def set_seed(seed: int):
+    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with the same value."""
+    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
+    for apply_seed in seeders:
+        # the last entry, torch.cuda.manual_seed_all, is safe to call even if cuda is not available
+        apply_seed(seed)
+
+
+@contextmanager
+def torch_distributed_zero_first(local_rank: int):
+    """
+    Context manager to make all processes in distributed training wait for the first one (locally) to do something.
+
+    Processes whose `local_rank` is neither -1 nor 0 block on a barrier before running the body of the
+    `with` statement; the process with `local_rank == 0` runs the body first and then joins the barrier.
+    With `local_rank == -1` no barrier is used at all.
+
+    Args:
+        local_rank: local rank of the calling process, or -1 when not running distributed.
+    """
+    if local_rank not in [-1, 0]:
+        torch.distributed.barrier()
+    try:
+        yield
+    finally:
+        # Join the barrier even if the body raised, so the waiting processes are not left blocked.
+        if local_rank == 0:
+            torch.distributed.barrier()
+
+
+class EvalPrediction(NamedTuple):
+    """
+    Evaluation output (always contains labels), to be used
+    to compute metrics.
+
+    This is the object handed to the `compute_metrics` callable configured on the Trainer.
+    """
+
+    # Model outputs gathered over the whole dataset (per-batch logits stacked along axis 0).
+    predictions: np.ndarray
+    # Labels gathered over the whole dataset from each batch's "labels" entry.
+    label_ids: np.ndarray
+
+
+class PredictionOutput(NamedTuple):
+    """
+    Result of the Trainer's prediction/evaluation loop.
+    """
+
+    # Model outputs gathered over the whole dataset (None when only the loss was requested).
+    predictions: np.ndarray
+    # Labels gathered from the batches, or None when the batches carried no "labels" entry.
+    label_ids: Optional[np.ndarray]
+    # Values returned by `compute_metrics` (if any), plus a "loss" entry when losses were computed.
+    metrics: Optional[Dict[str, float]]
+
+
+class TrainOutput(NamedTuple):
+    """
+    Value returned by `Trainer.train()`.
+    """
+
+    # Number of optimizer updates counted when training stopped.
+    global_step: int
+    # Accumulated training loss divided by the global step count.
+    training_loss: float
+
+
+PREFIX_CHECKPOINT_DIR = "checkpoint"
+
+
+class Trainer:
+    """
+    Trainer is a simple but feature-complete training and eval loop for PyTorch,
+    optimized for Transformers.
+    """
+
+    model: PreTrainedModel
+    args: TrainingArguments
+    data_collator: DataCollator
+    train_dataset: Optional[Dataset]
+    eval_dataset: Optional[Dataset]
+    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
+    prediction_loss_only: bool
+    tb_writer: Optional["SummaryWriter"] = None
+
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        args: TrainingArguments,
+        data_collator: Optional[DataCollator] = None,
+        train_dataset: Optional[Dataset] = None,
+        eval_dataset: Optional[Dataset] = None,
+        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
+        prediction_loss_only=False,
+    ):
+        """
+        Trainer is a simple but feature-complete training and eval loop for PyTorch,
+        optimized for Transformers.
+
+        Besides storing its arguments, construction seeds the random number generators from
+        `args.seed` and, on the main process (local rank -1 or 0), opens the Tensorboard writer
+        when available and creates `args.output_dir`.
+
+        Args:
+            prediction_loss_only:
+                (Optional) in evaluation and prediction, only return the loss
+        """
+        self.model = model
+        self.args = args
+        self.data_collator = DefaultDataCollator() if data_collator is None else data_collator
+        self.train_dataset = train_dataset
+        self.eval_dataset = eval_dataset
+        self.compute_metrics = compute_metrics
+        self.prediction_loss_only = prediction_loss_only
+
+        on_main_process = self.args.local_rank in [-1, 0]
+        if is_tensorboard_available():
+            if on_main_process:
+                self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
+        else:
+            logger.warning(
+                "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
+            )
+        set_seed(self.args.seed)
+        # Only the main process creates the output directory.
+        if on_main_process:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
+    def get_train_dataloader(self) -> DataLoader:
+        """
+        Build the training DataLoader: random sampling in single-process mode (local rank -1),
+        a DistributedSampler otherwise.
+
+        Raises:
+            ValueError: if the Trainer has no train_dataset.
+        """
+        dataset = self.train_dataset
+        if dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+        if self.args.local_rank == -1:
+            train_sampler = RandomSampler(dataset)
+        else:
+            train_sampler = DistributedSampler(dataset)
+        return DataLoader(
+            dataset,
+            batch_size=self.args.train_batch_size,
+            sampler=train_sampler,
+            collate_fn=self.data_collator.collate_batch,
+        )
+
+    def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
+        """
+        Build the evaluation DataLoader (no shuffling) over `eval_dataset`, falling back to the
+        dataset stored on the instance when the argument is None.
+
+        Raises:
+            ValueError: if neither the argument nor the instance provides a dataset.
+        """
+        dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
+        if dataset is None:
+            raise ValueError("Trainer: evaluation requires an eval_dataset.")
+        return DataLoader(
+            dataset,
+            batch_size=self.args.eval_batch_size,
+            shuffle=False,
+            collate_fn=self.data_collator.collate_batch,
+        )
+
+    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
+        """Build the prediction DataLoader (no shuffling) over `test_dataset`."""
+        loader_options = dict(
+            # We use the same batch_size as for eval.
+            batch_size=self.args.eval_batch_size,
+            shuffle=False,
+            collate_fn=self.data_collator.collate_batch,
+        )
+        return DataLoader(test_dataset, **loader_options)
+
+    def get_optimizers(
+        self, num_training_steps: int
+    ) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
+        """
+        Create the AdamW optimizer and the linear warmup/decay schedule for `num_training_steps`.
+
+        Parameters whose name contains "bias" or "LayerNorm.weight" are placed in a group with
+        weight decay 0.0; every other parameter uses `args.weight_decay`.
+        """
+        no_decay_markers = ("bias", "LayerNorm.weight")
+        decayed_params, undecayed_params = [], []
+        for param_name, param in self.model.named_parameters():
+            if any(marker in param_name for marker in no_decay_markers):
+                undecayed_params.append(param)
+            else:
+                decayed_params.append(param)
+
+        param_groups = [
+            {"params": decayed_params, "weight_decay": self.args.weight_decay},
+            {"params": undecayed_params, "weight_decay": 0.0},
+        ]
+        optimizer = AdamW(param_groups, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
+        scheduler = get_linear_schedule_with_warmup(
+            optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
+        )
+        return optimizer, scheduler
+
+    def train(self, model_path: Optional[str] = None):
+        """
+        Main training entry point.
+
+        Args:
+            model_path:
+                (Optional) Local path to model if model to train has been instantiated from a local path
+                If present, we will try reloading the optimizer/scheduler states from there.
+
+        Returns:
+            A `TrainOutput` holding the final global step and the accumulated training loss
+            divided by that step count.
+        """
+        train_dataloader = self.get_train_dataloader()
+
+        # Optimizer updates per epoch. Clamped to 1: when the dataloader is shorter than
+        # `gradient_accumulation_steps` the loop below still performs one update at the end of each
+        # epoch, and an unclamped 0 would raise ZeroDivisionError in the computations that follow.
+        num_update_steps_per_epoch = max(len(train_dataloader) // self.args.gradient_accumulation_steps, 1)
+
+        if self.args.max_steps > 0:
+            t_total = self.args.max_steps
+            num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + 1
+        else:
+            t_total = int(num_update_steps_per_epoch * self.args.num_train_epochs)
+            num_train_epochs = self.args.num_train_epochs
+
+        # Move the model to its device before any optimizer state is restored, so that the restored
+        # state ends up on the same device as the parameters it belongs to.
+        model = self.model
+        model.to(self.args.device)
+
+        optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)
+
+        # Check if saved optimizer or scheduler states exist
+        if (
+            model_path is not None
+            and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
+            and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
+        ):
+            # Load in optimizer and scheduler states; map_location keeps the load working when the
+            # checkpoint was written from a device that is not available in this process.
+            optimizer.load_state_dict(
+                torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
+            )
+            scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
+
+        if self.args.fp16:
+            if not is_apex_available():
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+            model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)
+
+        # multi-gpu training (should be after apex fp16 initialization)
+        if self.args.n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+
+        # Distributed training (should be after apex fp16 initialization)
+        if self.args.local_rank != -1:
+            model = torch.nn.parallel.DistributedDataParallel(
+                model,
+                device_ids=[self.args.local_rank],
+                output_device=self.args.local_rank,
+                find_unused_parameters=True,
+            )
+
+        if self.tb_writer is not None:
+            self.tb_writer.add_text("args", self.args.to_json_string())
+
+        # Train!
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_dataloader.dataset))
+        logger.info("  Num Epochs = %d", num_train_epochs)
+        logger.info("  Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
+        logger.info(
+            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+            self.args.train_batch_size
+            * self.args.gradient_accumulation_steps
+            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
+        )
+        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
+        logger.info("  Total optimization steps = %d", t_total)
+
+        global_step = 0
+        epochs_trained = 0
+        steps_trained_in_current_epoch = 0
+        # Check if continuing training from a checkpoint
+        if model_path is not None:
+            # set global_step to global_step of last saved checkpoint from model path
+            try:
+                global_step = int(model_path.split("-")[-1].split("/")[0])
+                epochs_trained = global_step // num_update_steps_per_epoch
+                steps_trained_in_current_epoch = global_step % num_update_steps_per_epoch
+
+                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+                logger.info("  Continuing training from epoch %d", epochs_trained)
+                logger.info("  Continuing training from global step %d", global_step)
+                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+            except ValueError:
+                # The path does not end in a step number: treat it as a fresh fine-tuning run.
+                global_step = 0
+                logger.info("  Starting fine-tuning.")
+
+        # The skip below is applied per batch, while the count above is in optimizer updates:
+        # each update consumed `gradient_accumulation_steps` batches.
+        batches_to_skip = steps_trained_in_current_epoch * self.args.gradient_accumulation_steps
+
+        tr_loss = 0.0
+        logging_loss = 0.0
+        # Global step at which `logging_loss` was last refreshed; used to average the logged loss
+        # over the number of updates that actually happened since the previous log entry.
+        logging_loss_step = global_step
+        model.zero_grad()
+        train_iterator = trange(
+            epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
+        )
+        for epoch in train_iterator:
+            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
+            for step, inputs in enumerate(epoch_iterator):
+
+                # Skip past any already trained batches if resuming training
+                if batches_to_skip > 0:
+                    batches_to_skip -= 1
+                    continue
+
+                tr_loss += self._training_step(model, inputs, optimizer)
+
+                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
+                    # last step in epoch but step is always smaller than gradient_accumulation_steps
+                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
+                    and (step + 1) == len(epoch_iterator)
+                ):
+                    if self.args.fp16:
+                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
+                    else:
+                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
+
+                    optimizer.step()
+                    scheduler.step()
+                    model.zero_grad()
+                    global_step += 1
+
+                    if self.args.local_rank in [-1, 0]:
+                        if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
+                            global_step == 1 and self.args.logging_first_step
+                        ):
+                            logs = {}
+                            if self.args.evaluate_during_training:
+                                results = self.evaluate()
+                                for key, value in results.items():
+                                    eval_key = "eval_{}".format(key)
+                                    logs[eval_key] = value
+
+                            # global_step was just incremented, so the divisor is always >= 1, including
+                            # for the first-step log and when logging_steps is 0.
+                            loss_scalar = (tr_loss - logging_loss) / (global_step - logging_loss_step)
+                            learning_rate_scalar = scheduler.get_last_lr()[0]
+                            logs["learning_rate"] = learning_rate_scalar
+                            logs["loss"] = loss_scalar
+                            logging_loss = tr_loss
+                            logging_loss_step = global_step
+
+                            if self.tb_writer:
+                                for k, v in logs.items():
+                                    self.tb_writer.add_scalar(k, v, global_step)
+                            epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
+
+                        if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
+                            # In all cases (even distributed/parallel), self.model is always a reference
+                            # to the model we want to save.
+                            if hasattr(model, "module"):
+                                assert model.module is self.model
+                            else:
+                                assert model is self.model
+                            # Save model checkpoint
+                            output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{global_step}")
+                            self.save_model(output_dir)
+                            self._rotate_checkpoints()
+                            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                            logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+                # Stop as soon as max_steps updates have been performed (the schedule is built for
+                # exactly t_total == max_steps updates).
+                if self.args.max_steps > 0 and global_step >= self.args.max_steps:
+                    epoch_iterator.close()
+                    break
+            if self.args.max_steps > 0 and global_step >= self.args.max_steps:
+                train_iterator.close()
+                break
+
+        if self.tb_writer:
+            self.tb_writer.close()
+
+        logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
+        # Guard against a run in which no optimizer update happened at all.
+        return TrainOutput(global_step, tr_loss / max(global_step, 1))
+
+    def _training_step(
+        self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer
+    ) -> float:
+        """
+        Run one forward/backward pass on a batch and return the (scaled) loss as a Python float.
+
+        The tensors in `inputs` are moved to `args.device` in place. The loss is averaged when
+        several GPUs are used and divided by `gradient_accumulation_steps` when accumulating.
+        """
+        model.train()
+        inputs.update({name: tensor.to(self.args.device) for name, tensor in inputs.items()})
+
+        # model outputs are always tuple in transformers (see doc); the loss comes first
+        loss = model(**inputs)[0]
+
+        if self.args.n_gpu > 1:
+            loss = loss.mean()  # mean() to average on multi-gpu parallel training
+        accumulation_steps = self.args.gradient_accumulation_steps
+        if accumulation_steps > 1:
+            loss = loss / accumulation_steps
+
+        if not self.args.fp16:
+            loss.backward()
+        else:
+            with amp.scale_loss(loss, optimizer) as scaled_loss:
+                scaled_loss.backward()
+
+        return loss.item()
+
+    def is_world_master(self) -> bool:
+        """
+        This will be True only in one process, even in distributed mode,
+        even when training on multiple machines.
+        """
+        if self.args.local_rank == -1:
+            # Not running distributed: the single process is the master.
+            return True
+        return torch.distributed.get_rank() == 0
+
+    def save_model(self, output_dir: Optional[str] = None):
+        """
+        Saving best-practices: if you use default names for the model,
+        you can reload it using from_pretrained().
+
+        Will only save from the master process.
+        """
+        if not self.is_world_master():
+            return
+        self._save(output_dir)
+
+    def _save(self, output_dir: Optional[str] = None):
+        """
+        Write the model (via `save_pretrained`) and the training arguments into `output_dir`,
+        defaulting to `args.output_dir`. The directory is created if needed.
+
+        Raises:
+            ValueError: if the Trainer's model is not a PreTrainedModel.
+        """
+        target_dir = self.args.output_dir if output_dir is None else output_dir
+        os.makedirs(target_dir, exist_ok=True)
+        logger.info("Saving model checkpoint to %s", target_dir)
+
+        # `save_pretrained()` output can be reloaded later with `from_pretrained()`.
+        model_is_pretrained = isinstance(self.model, PreTrainedModel)
+        if not model_is_pretrained:
+            raise ValueError("Trainer.model appears to not be a PreTrainedModel")
+        self.model.save_pretrained(target_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        args_path = os.path.join(target_dir, "training_args.bin")
+        torch.save(self.args, args_path)
+
+    def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]:
+        """
+        List the checkpoint directories found in `args.output_dir`, oldest first.
+
+        Args:
+            checkpoint_prefix: directories named `<checkpoint_prefix>-*` are considered.
+            use_mtime: order by modification time instead of by the step number in the name.
+                When ordering by step number, entries whose name carries no number are ignored.
+
+        Returns:
+            The checkpoint paths as strings, sorted in ascending order.
+        """
+        ordering_and_checkpoint_path = []
+
+        # `Path.glob` yields `Path` objects; convert them to `str` because `re.match` only accepts
+        # str/bytes and because the declared return type is `List[str]`.
+        glob_checkpoints = [str(path) for path in Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*")]
+        # Escape the prefix so that it is always matched literally.
+        step_pattern = re.compile(r".*{}-([0-9]+)".format(re.escape(checkpoint_prefix)))
+
+        for path in glob_checkpoints:
+            if use_mtime:
+                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
+            else:
+                regex_match = step_pattern.match(path)
+                if regex_match:
+                    ordering_and_checkpoint_path.append((int(regex_match.group(1)), path))
+
+        return [path for _, path in sorted(ordering_and_checkpoint_path)]
+
+    def _rotate_checkpoints(self, use_mtime=False) -> None:
+        """
+        Delete the oldest checkpoint directories so that at most `args.save_total_limit` remain.
+        Does nothing when the limit is unset or not positive.
+        """
+        limit = self.args.save_total_limit
+        if not limit or limit <= 0:
+            return
+
+        # Oldest checkpoints come first in the sorted list.
+        checkpoints = self._sorted_checkpoints(use_mtime=use_mtime)
+        surplus = len(checkpoints) - limit
+        if surplus <= 0:
+            return
+
+        for stale_checkpoint in checkpoints[:surplus]:
+            logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale_checkpoint))
+            shutil.rmtree(stale_checkpoint)
+
+    def evaluate(
+        self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None
+    ) -> Dict[str, float]:
+        """
+        Run evaluation and return metrics.
+
+        The calling script will be responsible for providing a method to compute metrics, as they are
+        task-dependent.
+
+        Args:
+            eval_dataset: (Optional) Pass a dataset if you wish to override
+            the one on the instance.
+            prediction_loss_only: (Optional) Override, for this call, the instance-level
+            setting that restricts the output to the loss.
+        Returns:
+            A dict containing:
+                - the eval loss
+                - the potential metrics computed from the predictions
+        """
+        eval_dataloader = self.get_eval_dataloader(eval_dataset)
+
+        # Forward the override; when it is None the prediction loop falls back to the instance setting.
+        output = self._prediction_loop(
+            eval_dataloader, description="Evaluation", prediction_loss_only=prediction_loss_only
+        )
+        return output.metrics
+
+    def predict(self, test_dataset: Dataset) -> PredictionOutput:
+        """
+        Run prediction and return predictions and potential metrics.
+
+        Depending on the dataset and your use case, your test dataset may contain labels.
+        In that case, this method will also return metrics, like in evaluate().
+        """
+        return self._prediction_loop(self.get_test_dataloader(test_dataset), description="Prediction")
+
+    def _prediction_loop(
+        self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
+    ) -> PredictionOutput:
+        """
+        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
+
+        Works both with or without labels.
+
+        Args:
+            dataloader: batches to run the model on.
+            description: label used in the log messages and the progress bar.
+            prediction_loss_only: when True, predictions and labels are not collected; when None,
+                the instance-level `prediction_loss_only` is used.
+
+        Returns:
+            A `PredictionOutput` with the stacked predictions and labels (None when not collected)
+            and the metrics dict, which holds a "loss" entry whenever a loss was computed.
+        """
+
+        prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
+
+        # multi-gpu eval
+        if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
+            model = torch.nn.DataParallel(self.model)
+        else:
+            model = self.model
+        model.to(self.args.device)
+
+        logger.info("***** Running %s *****", description)
+        logger.info("  Num examples = %d", len(dataloader.dataset))
+        logger.info("  Batch size = %d", dataloader.batch_size)
+        eval_losses: List[float] = []
+        # Per-batch arrays are collected and concatenated once at the end; growing a single array
+        # with np.append would re-copy everything accumulated so far on every batch.
+        preds_batches: List[np.ndarray] = []
+        label_batches: List[np.ndarray] = []
+        model.eval()
+
+        for inputs in tqdm(dataloader, desc=description):
+            has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])
+
+            for k, v in inputs.items():
+                inputs[k] = v.to(self.args.device)
+
+            with torch.no_grad():
+                outputs = model(**inputs)
+                if has_labels:
+                    # With labels, the model returns the loss first and the logits second.
+                    step_eval_loss, logits = outputs[:2]
+                    eval_losses += [step_eval_loss.mean().item()]
+                else:
+                    logits = outputs[0]
+
+            if not prediction_loss_only:
+                preds_batches.append(logits.detach().cpu().numpy())
+                if inputs.get("labels") is not None:
+                    label_batches.append(inputs["labels"].detach().cpu().numpy())
+
+        preds: Optional[np.ndarray] = np.concatenate(preds_batches, axis=0) if preds_batches else None
+        label_ids: Optional[np.ndarray] = np.concatenate(label_batches, axis=0) if label_batches else None
+
+        if self.compute_metrics is not None and preds is not None and label_ids is not None:
+            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
+        else:
+            metrics = {}
+        if len(eval_losses) > 0:
+            metrics["loss"] = np.mean(eval_losses)
+
+        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1,5 +1,17 @@
+import dataclasses
+import json
+import logging
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Optional, Tuple
+
+from .file_utils import cached_property, is_torch_available, torch_required
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.getLogger(__name__)


@dataclass
@@ -22,6 +34,7 @@ class TrainingArguments:

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."}
    )
@@ -44,6 +57,8 @@ class TrainingArguments:
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

+    logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
+    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
@@ -52,12 +67,6 @@ class TrainingArguments:
            "help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default"
        },
    )
-    eval_all_checkpoints: bool = field(
-        default=False,
-        metadata={
-            "help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
-        },
-    )
    no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

@@ -73,3 +82,47 @@ class TrainingArguments:
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
+
+    @property
+    def train_batch_size(self) -> int:
+        """Training batch size per process: the per-GPU size times the GPU count (counted as at least 1)."""
+        device_multiplier = max(1, self.n_gpu)
+        return self.per_gpu_train_batch_size * device_multiplier
+
+    @property
+    def eval_batch_size(self) -> int:
+        """Evaluation batch size per process: the per-GPU size times the GPU count (counted as at least 1)."""
+        device_multiplier = max(1, self.n_gpu)
+        return self.per_gpu_eval_batch_size * device_multiplier
+
+    @cached_property
+    @torch_required
+    def _setup_devices(self) -> Tuple["torch.device", int]:
+        """
+        Pick the torch device and GPU count for this process, returned as `(device, n_gpu)`.
+
+        In distributed mode (`local_rank != -1`) this also initializes the NCCL process group.
+        """
+        logger.info("PyTorch: setting up devices")
+        if self.no_cuda:
+            return torch.device("cpu"), 0
+
+        if self.local_rank != -1:
+            # Here, we'll use torch.distributed.
+            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+            torch.distributed.init_process_group(backend="nccl")
+            return torch.device("cuda", self.local_rank), 1
+
+        # if n_gpu is > 1 we'll use nn.DataParallel.
+        # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
+        device_name = "cuda" if torch.cuda.is_available() else "cpu"
+        device = torch.device(device_name)
+        return device, torch.cuda.device_count()
+
+    @property
+    @torch_required
+    def device(self) -> "torch.device":
+        """The torch device selected by `_setup_devices`."""
+        selected_device, _ = self._setup_devices
+        return selected_device
+
+    @property
+    @torch_required
+    def n_gpu(self):
+        """The GPU count determined by `_setup_devices`."""
+        _, gpu_count = self._setup_devices
+        return gpu_count
+
+    def to_json_string(self):
+        """
+        Serializes this instance to a JSON string.
+        """
+        fields_as_dict = dataclasses.asdict(self)
+        return json.dumps(fields_as_dict, indent=2)