Trainer (#3800)

* doc * [tests] Add sample files for a regression task * [HUGE] Trainer * Feedback from @sshleifer * Feedback from @thomwolf + logging tweak * [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes * [glue] Use default max_seq_length of 128 like before * [glue] move DataTrainingArguments around * [ner] Change interface of InputExample, and align run_{tf,pl} * Re-align the pl scripts a little bit * ner * [ner] Add integration test * Fix language_modeling with API tweak * [ci] Tweak loss target * Don't break console output * amp.initialize: model must be on right device before * [multiple-choice] update for Trainer * Re-align to 827d6d6ef0
2020-04-21 20:11:56 -04:00
parent eb5601b0a5
commit dd9d483d03
41 changed files with 2682 additions and 2567 deletions
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -0,0 +1,144 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, NewType, Tuple
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+from ..tokenization_utils import PreTrainedTokenizer
+
+
+class DataCollator(ABC):
+    """
+    A `DataCollator` is responsible for batching
+    and pre-processing samples of data as requested by the training loop.
+
+    Concrete collators implement `collate_batch`, which receives the list of
+    samples that make up one batch.
+    """
+
+    @abstractmethod
+    def collate_batch(self, features: List[Any]) -> Dict[str, torch.Tensor]:
+        """
+        Take a list of samples from a Dataset and collate them into a batch.
+
+        Args:
+            features: The samples of one batch. Their concrete type is decided
+                by the implementing collator.
+
+        Returns:
+            A dictionary of tensors
+        """
+        pass
+
+
+# Nominal type used to annotate the per-sample objects given to
+# `DefaultDataCollator.collate_batch`. It wraps `Any`, so no particular class is enforced.
+InputDataClass = NewType("InputDataClass", Any)
+
+
+@dataclass
+class DefaultDataCollator(DataCollator):
+    """
+    Minimal collator for batches of attribute-carrying sample objects.
+
+    - Every non-None, non-string attribute of a sample becomes a `torch.long`
+      tensor stored under the attribute's own name.
+    - Two attribute names get special treatment and are emitted under the
+      key `labels`:
+        - `label`: one value (int or float) per sample
+        - `label_ids`: a list of values per sample
+    - No other preprocessing is performed.
+
+    i.e., Property names of the input object will be used as corresponding inputs to the model.
+    See glue and ner for example of how it's useful.
+    """
+
+    def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
+        """
+        Build a dictionary of tensors from a list of sample objects.
+
+        The first sample is taken as representative of the whole batch: its
+        attributes decide which keys are produced and which dtype the labels get.
+        """
+        sample = features[0]
+        batch = {}
+
+        # Labels: long for exact `int` values, float for anything else.
+        # `label` takes precedence over `label_ids`.
+        if getattr(sample, "label", None) is not None:
+            label_dtype = torch.long if type(sample.label) is int else torch.float
+            batch["labels"] = torch.tensor([f.label for f in features], dtype=label_dtype)
+        elif getattr(sample, "label_ids", None) is not None:
+            label_dtype = torch.long if type(sample.label_ids[0]) is int else torch.float
+            batch["labels"] = torch.tensor([f.label_ids for f in features], dtype=label_dtype)
+
+        # Remaining attributes: skip the label fields, unset values and strings.
+        for name, value in vars(sample).items():
+            if name in ("label", "label_ids") or value is None or isinstance(value, str):
+                continue
+            batch[name] = torch.tensor([getattr(f, name) for f in features], dtype=torch.long)
+        return batch
+
+
+@dataclass
+class DataCollatorForLanguageModeling(DataCollator):
+    """
+    Data collator used for language modeling.
+    - collates batches of tensors, honoring their tokenizer's pad_token
+    - preprocesses batches for masked language modeling
+    - when `mlm` is False, returns labels that mirror the inputs, with the
+      positions introduced by padding set to -100
+    """
+
+    tokenizer: PreTrainedTokenizer
+    mlm: bool = True
+    mlm_probability: float = 0.15
+
+    def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        Collate a list of 1-D token-id tensors into one batch.
+
+        Returns:
+            `{"input_ids", "masked_lm_labels"}` when `self.mlm` is True,
+            `{"input_ids", "labels"}` otherwise.
+
+        Raises:
+            ValueError: if padding is required but the tokenizer has no pad token,
+                or if `self.mlm` is True and the tokenizer has no mask token.
+        """
+        batch = self._tensorize_batch(examples)
+        if self.mlm:
+            inputs, labels = self.mask_tokens(batch)
+            return {"input_ids": inputs, "masked_lm_labels": labels}
+
+        # Labels get their own tensor so that padded positions can be excluded
+        # from the loss without altering `input_ids`.
+        labels = batch.clone()
+        lengths = torch.tensor([x.size(0) for x in examples], dtype=torch.long, device=batch.device)
+        positions = torch.arange(batch.size(1), dtype=torch.long, device=batch.device)
+        # `pad_sequence` appends padding at the end of each example, so every position
+        # at or beyond an example's own length is padding. -100 is the same "ignore"
+        # value `mask_tokens` uses and is the default `ignore_index` of
+        # `torch.nn.CrossEntropyLoss`. When all examples have equal length nothing is masked.
+        labels[positions.unsqueeze(0) >= lengths.unsqueeze(1)] = -100
+        return {"input_ids": batch, "labels": labels}
+
+    def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Turn a list of 1-D tensors into a single 2-D tensor (batch first).
+
+        Equal-length examples are stacked; otherwise they are padded at the end
+        with the tokenizer's pad token id.
+
+        Raises:
+            ValueError: if padding is needed and the tokenizer has no pad token.
+        """
+        length_of_first = examples[0].size(0)
+        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
+        if are_tensors_same_length:
+            return torch.stack(examples, dim=0)
+        else:
+            if self.tokenizer._pad_token is None:
+                raise ValueError(
+                    "You are attempting to pad samples but the tokenizer you are using"
+                    f" ({self.tokenizer.__class__.__name__}) does not have one."
+                )
+            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+
+    def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
+
+        `inputs` is modified in place and also returned; `labels` is a clone of the
+        original ids with every non-selected position set to -100.
+
+        Raises:
+            ValueError: if the tokenizer has no mask token.
+        """
+
+        if self.tokenizer.mask_token is None:
+            raise ValueError(
+                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
+            )
+
+        labels = inputs.clone()
+        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
+        probability_matrix = torch.full(labels.shape, self.mlm_probability)
+        # Special tokens must never be selected for masking.
+        special_tokens_mask = [
+            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
+        ]
+        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
+        # Neither must padding, when the tokenizer defines a pad token.
+        if self.tokenizer._pad_token is not None:
+            padding_mask = labels.eq(self.tokenizer.pad_token_id)
+            probability_matrix.masked_fill_(padding_mask, value=0.0)
+        masked_indices = torch.bernoulli(probability_matrix).bool()
+        labels[~masked_indices] = -100  # We only compute loss on masked tokens
+
+        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
+        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
+        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
+
+        # 10% of the time, we replace masked input tokens with random word
+        # (half of the remaining 20% of selected positions)
+        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
+        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
+        inputs[indices_random] = random_words[indices_random]
+
+        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
+        return inputs, labels
--- a/src/transformers/data/datasets/init.py
+++ b/src/transformers/data/datasets/init.py
@@ -0,0 +1,6 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+from .glue import GlueDataset, GlueDataTrainingArguments
+from .language_modeling import LineByLineTextDataset, TextDataset
--- a/src/transformers/data/datasets/glue.py
+++ b/src/transformers/data/datasets/glue.py
@@ -0,0 +1,124 @@
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import torch
+from torch.utils.data.dataset import Dataset
+
+from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
+from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_xlm_roberta import XLMRobertaTokenizer
+from ...trainer import torch_distributed_zero_first
+from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
+from ..processors.utils import InputFeatures
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class GlueDataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    # Required. The help text is built from the keys of `glue_processors`.
+    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
+    # Required. Directory that holds the task's data files.
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    # Optional, defaults to 128.
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    # Optional, defaults to False.
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+    def __post_init__(self):
+        # Store the task name in lower case once the generated __init__ has run.
+        # NOTE(review): presumably this makes lookups keyed by task name
+        # case-insensitive for the user - confirm against the keys of `glue_processors`.
+        self.task_name = self.task_name.lower()
+
+
+class GlueDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+
+    Holds the list of `InputFeatures` for one split (train or dev) of a GLUE
+    task. Features are either converted from the task's data files or loaded
+    from a cache file stored inside `args.data_dir`.
+    """
+
+    args: GlueDataTrainingArguments
+    output_mode: str
+    features: List[InputFeatures]
+
+    def __init__(
+        self,
+        args: GlueDataTrainingArguments,
+        tokenizer: PreTrainedTokenizer,
+        limit_length: Optional[int] = None,
+        evaluate=False,
+        local_rank=-1,
+    ):
+        """
+        Args:
+            args: Task name, data directory, max sequence length and cache policy.
+            tokenizer: Tokenizer used to convert examples into features.
+            limit_length: If given, only the first `limit_length` examples are converted.
+            evaluate: Use the dev examples instead of the train examples.
+            local_rank: Rank passed on to `torch_distributed_zero_first`; the cache
+                file is only written when it is -1 or 0.
+        """
+        self.args = args
+        processor = glue_processors[args.task_name]()
+        self.output_mode = glue_output_modes[args.task_name]
+        # Load data features from cache or dataset file
+        # NOTE(review): the cache file name does not depend on `limit_length`, so a
+        # cache written by a length-limited run is reused as-is by later runs, and an
+        # existing cache is returned in full even when `limit_length` is given.
+        cached_features_file = os.path.join(
+            args.data_dir,
+            "cached_{}_{}_{}_{}".format(
+                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
+            ),
+        )
+        with torch_distributed_zero_first(local_rank):
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                start = time.time()
+                # NOTE: `torch.load` unpickles the file; only point `data_dir` at trusted locations.
+                self.features = torch.load(cached_features_file)
+                logger.info(
+                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                )
+            else:
+                logger.info("Creating features from dataset file at %s", args.data_dir)
+                label_list = processor.get_labels()
+                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
+                    RobertaTokenizer,
+                    RobertaTokenizerFast,
+                    XLMRobertaTokenizer,
+                ):
+                    # HACK(label indices are swapped in RoBERTa pretrained model)
+                    label_list[1], label_list[2] = label_list[2], label_list[1]
+                examples = (
+                    processor.get_dev_examples(args.data_dir)
+                    if evaluate
+                    else processor.get_train_examples(args.data_dir)
+                )
+                if limit_length is not None:
+                    examples = examples[:limit_length]
+                self.features = glue_convert_examples_to_features(
+                    examples,
+                    tokenizer,
+                    max_length=args.max_seq_length,
+                    label_list=label_list,
+                    output_mode=self.output_mode,
+                )
+                if local_rank in [-1, 0]:
+                    start = time.time()
+                    torch.save(self.features, cached_features_file)
+                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
+                    logger.info(
+                        "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                    )
+
+    def __len__(self):
+        return len(self.features)
+
+    def __getitem__(self, i) -> InputFeatures:
+        return self.features[i]
--- a/src/transformers/data/datasets/language_modeling.py
+++ b/src/transformers/data/datasets/language_modeling.py
@@ -0,0 +1,101 @@
+import logging
+import os
+import pickle
+import time
+
+import torch
+from torch.utils.data.dataset import Dataset
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...trainer import torch_distributed_zero_first
+
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+class TextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+
+    Reads a whole text file, tokenizes it, and cuts the token ids into
+    consecutive blocks of equal size; each block gets the tokenizer's special
+    tokens added. The blocks are cached with `pickle` next to the input file.
+    """
+
+    def __init__(
+        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1,
+    ):
+        """
+        Args:
+            tokenizer: Tokenizer used to tokenize the text and add special tokens.
+            file_path: Path of the text file to read; must be an existing file.
+            block_size: Total length of each example, special tokens included.
+            overwrite_cache: Rebuild the examples even if a cache file exists.
+            local_rank: Rank passed on to `torch_distributed_zero_first`.
+        """
+        assert os.path.isfile(file_path)
+
+        # Reserve room for the special tokens added to every block below.
+        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
+
+        directory, filename = os.path.split(file_path)
+        cached_features_file = os.path.join(
+            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
+        )
+
+        with torch_distributed_zero_first(local_rank):
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+
+            if os.path.exists(cached_features_file) and not overwrite_cache:
+                start = time.time()
+                # NOTE: the cache is unpickled; only use data directories you trust.
+                with open(cached_features_file, "rb") as handle:
+                    self.examples = pickle.load(handle)
+                logger.info(
+                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                )
+
+            else:
+                logger.info("Creating features from dataset file at %s", directory)
+
+                self.examples = []
+                with open(file_path, encoding="utf-8") as f:
+                    text = f.read()
+
+                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
+
+                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
+                    self.examples.append(
+                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
+                    )
+                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
+                # If your dataset is small, first you should look for a bigger one :-) and second you
+                # can change this behavior by adding (model specific) padding.
+
+                start = time.time()
+                with open(cached_features_file, "wb") as handle:
+                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
+                logger.info(
+                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
+                )
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> torch.Tensor:
+        return torch.tensor(self.examples[i], dtype=torch.long)
+
+
+class LineByLineTextDataset(Dataset):
+    """
+    This will be superseded by a framework-agnostic approach
+    soon.
+
+    Treats every non-empty, non-whitespace line of a text file as one example
+    and encodes all of them in a single `batch_encode_plus` call.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
+        """
+        Args:
+            tokenizer: Tokenizer used to encode the lines.
+            file_path: Path of the text file to read; must be an existing file.
+            block_size: Passed to the tokenizer as `max_length`.
+            local_rank: Accepted for signature parity with `TextDataset`; not used here.
+        """
+        assert os.path.isfile(file_path)
+        # Here, we do not cache the features, operating under the assumption
+        # that we will soon use fast multithreaded tokenizers from the
+        # `tokenizers` repo everywhere =)
+        logger.info("Creating features from dataset file at %s", file_path)
+
+        with open(file_path, encoding="utf-8") as f:
+            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+
+        # Every retained line becomes an example.
+        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
+        self.examples = batch_encoding["input_ids"]
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i) -> torch.Tensor:
+        return torch.tensor(self.examples[i], dtype=torch.long)
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -17,6 +17,7 @@

 import logging
 import os
+from enum import Enum
 from typing import List, Optional, Union

 from ...file_utils import is_tf_available
@@ -153,6 +154,11 @@ def _glue_convert_examples_to_features(
    return features


+class OutputMode(Enum):
+    """Enumeration of the two output modes, `classification` and `regression`; each member's value equals its name."""
+
+    classification = "classification"
+    regression = "regression"
+
+
 class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -14,13 +14,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import copy
 import csv
 import dataclasses
 import json
 import logging
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional, Union

 from ...file_utils import is_tf_available, is_torch_available

@@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available
 logger = logging.getLogger(__name__)


-@dataclass(frozen=False)
+@dataclass
 class InputExample:
    """
    A single training/test example for simple sequence classification.
@@ -50,42 +49,37 @@ class InputExample:

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
-        return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self), indent=2) + "\n"


-class InputFeatures(object):
+@dataclass(frozen=True)
+class InputFeatures:
    """
    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
    """

-    def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), sort_keys=True) + "\n"
+        return json.dumps(dataclasses.asdict(self)) + "\n"


-class DataProcessor(object):
+class DataProcessor:
    """Base class for data converters for sequence classification data sets."""

    def get_example_from_tensor_dict(self, tensor_dict):