Trainer (#3800)
* doc
* [tests] Add sample files for a regression task
* [HUGE] Trainer
* Feedback from @sshleifer
* Feedback from @thomwolf + logging tweak
* [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes
* [glue] Use default max_seq_length of 128 like before
* [glue] move DataTrainingArguments around
* [ner] Change interface of InputExample, and align run_{tf,pl}
* Re-align the pl scripts a little bit
* ner
* [ner] Add integration test
* Fix language_modeling with API tweak
* [ci] Tweak loss target
* Don't break console output
* amp.initialize: model must be on right device before
* [multiple-choice] update for Trainer
* Re-align to 827d6d6ef0
This commit is contained in:
144
src/transformers/data/data_collator.py
Normal file
144
src/transformers/data/data_collator.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, NewType, Tuple
|
||||
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
|
||||
from ..tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
class DataCollator(ABC):
    """
    Interface for objects that turn a list of dataset samples into one batch.

    A `DataCollator` is responsible for batching
    and pre-processing samples of data as requested by the training loop.
    """

    @abstractmethod
    def collate_batch(self, features: List[Any]) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.

        Args:
            features: The samples to batch together. The element type is
                chosen by the concrete collator.

        Returns:
            A dictionary of tensors
        """
        # The signature now names the list of samples so that it agrees with the
        # docstring and with the concrete collators, which all accept one.
        pass
|
||||
|
||||
|
||||
# Type alias used to annotate the samples passed to `DefaultDataCollator.collate_batch`.
# It is declared over `Any`, so it documents intent without constraining the sample type.
InputDataClass = NewType("InputDataClass", Any)
|
||||
|
||||
|
||||
@dataclass
class DefaultDataCollator(DataCollator):
    """
    Very simple data collator that:
    - simply collates batches of objects whose fields are plain attributes
    - Performs special handling for potential attributes named:
        - `label`: handles a single value (int or float) per object
        - `label_ids`: handles a list of values per object
    - does not do any additional preprocessing

    i.e., Property names of the input object will be used as corresponding inputs to the model.
    See glue and ner for example of how it's useful.
    """

    def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
        """
        Stack the attributes of `features` into one tensor per attribute.

        Args:
            features: Samples of one batch. All samples are assumed to expose
                the same attributes; the first one is used as a proxy to decide
                which attributes exist and are not None.

        Returns:
            A dictionary mapping attribute names to tensors. `label` /
            `label_ids` are emitted under the key `"labels"` (long for int
            labels, float otherwise); every other non-None, non-string
            attribute is emitted under its own name as a long tensor.
            An empty `features` list yields an empty dictionary.
        """
        if not features:
            # Nothing to inspect: avoid an IndexError on features[0].
            return {}

        first = features[0]
        batch = {}

        # Special handling for labels.
        # Ensure that tensor is created with the correct type
        # (it should be automatically the case, but let's make sure of it.)
        label = getattr(first, "label", None)
        label_ids = getattr(first, "label_ids", None)
        if label is not None:
            dtype = torch.long if isinstance(label, int) else torch.float
            batch["labels"] = torch.tensor([f.label for f in features], dtype=dtype)
        elif label_ids is not None:
            dtype = torch.long if isinstance(label_ids[0], int) else torch.float
            batch["labels"] = torch.tensor([f.label_ids for f in features], dtype=dtype)

        # Handling of all other possible attributes.
        # Again, we will use the first element to figure out which key/values are not None for this model.
        for k, v in vars(first).items():
            if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
                batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long)
        return batch
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForLanguageModeling(DataCollator):
    """
    Data collator used for language modeling.
    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling
    """

    # Tokenizer providing the pad/mask tokens and the vocabulary size.
    tokenizer: PreTrainedTokenizer
    # When True, batches are prepared for masked language modeling.
    mlm: bool = True
    # Probability with which each non-special, non-padding token is selected for masking.
    mlm_probability: float = 0.15

    def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Build one language-modeling batch out of `examples`.

        Args:
            examples: 1-D tensors of token ids, one per sample.

        Returns:
            `{"input_ids", "masked_lm_labels"}` when `mlm` is set, otherwise
            `{"input_ids", "labels"}`. Label positions that must not contribute
            to the loss are set to -100.
        """
        batch = self._tensorize_batch(examples)
        if self.mlm:
            inputs, labels = self.mask_tokens(batch)
            return {"input_ids": inputs, "masked_lm_labels": labels}

        # Labels get their own storage so they do not alias `input_ids`, and padded
        # positions are set to -100 (the value already used above for "no loss")
        # so that padding does not contribute to the loss.
        labels = batch.clone()
        if self.tokenizer._pad_token is not None:
            labels[labels == self.tokenizer.pad_token_id] = -100
        return {"input_ids": batch, "labels": labels}

    def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
        """
        Combine `examples` into a single 2-D tensor.

        Same-length examples are stacked; otherwise they are right-padded with
        the tokenizer's pad token id.

        Raises:
            ValueError: If padding is needed but the tokenizer has no pad token.
        """
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            return torch.stack(examples, dim=0)

        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({self.tokenizer.__class__.__name__}) does not have one."
            )
        return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Note that `inputs` is modified in place and returned.

        Returns:
            A tuple `(inputs, labels)` where `labels` holds the original token
            id at every selected position and -100 everywhere else.

        Raises:
            ValueError: If the tokenizer has no mask token.
        """

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )

        labels = inputs.clone()
        # We sample a few tokens in each sequence for masked-LM training (with probability
        # self.mlm_probability, which defaults to 0.15 as in Bert/RoBERTa)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # Special tokens and padding are never selected.
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        # (half of the remaining 20% of selected positions)
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels
|
||||
6
src/transformers/data/datasets/__init__.py
Normal file
6
src/transformers/data/datasets/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
from .glue import GlueDataset, GlueDataTrainingArguments
|
||||
from .language_modeling import LineByLineTextDataset, TextDataset
|
||||
124
src/transformers/data/datasets/glue.py
Normal file
124
src/transformers/data/datasets/glue.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
from ...trainer import torch_distributed_zero_first
|
||||
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
|
||||
from ..processors.utils import InputFeatures
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class GlueDataTrainingArguments:
    """
    Data-related settings for GLUE training and evaluation.

    The fields are declared with `dataclasses.field` and carry a `help`
    entry in their metadata, so that `HfArgumentParser` can expose them
    as command-line arguments.
    """

    # Required: GLUE task identifier; lower-cased after initialization.
    task_name: str = field(
        metadata={"help": f"The name of the task to train on: {', '.join(glue_processors.keys())}"}
    )
    # Required: directory holding the task's data files.
    data_dir: str = field(
        metadata={
            "help": "The input data dir. Should contain the .tsv files (or other data files) for the task."
        }
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. "
                "Sequences longer than this will be truncated, "
                "sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )

    def __post_init__(self):
        # Normalize the task name to lower case.
        lowered = self.task_name.lower()
        self.task_name = lowered
|
||||
|
||||
|
||||
class GlueDataset(Dataset):
    """
    Map-style dataset of GLUE `InputFeatures`, cached on disk next to the data.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        evaluate=False,
        local_rank=-1,
    ):
        """
        Load the features from the cache file if present, otherwise build and cache them.

        Args:
            args: Task name, data directory, max sequence length and cache policy.
            tokenizer: Tokenizer used to convert examples to features.
            limit_length: If given, only the first `limit_length` examples are converted.
            evaluate: Use the dev examples instead of the train examples.
            local_rank: Rank of the current process; only ranks -1 and 0 write the cache.
        """
        self.args = args
        processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        # Load data features from cache or dataset file
        # NOTE(review): `limit_length` is not part of the cache file name, so a cache
        # written from a truncated run is reused by later runs with a different limit.
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}_{}".format(
                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
            ),
        )
        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                # NOTE(review): torch.load unpickles the file; only load caches you trust.
                self.features = torch.load(cached_features_file)
                # Pass the path as a %-argument: interpolating it into the format string
                # breaks the log record when the path itself contains a '%'.
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
            else:
                logger.info("Creating features from dataset file at %s", args.data_dir)
                label_list = processor.get_labels()
                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                    RobertaTokenizer,
                    RobertaTokenizerFast,
                    XLMRobertaTokenizer,
                ):
                    # HACK(label indices are swapped in RoBERTa pretrained model)
                    label_list[1], label_list[2] = label_list[2], label_list[1]
                examples = (
                    processor.get_dev_examples(args.data_dir)
                    if evaluate
                    else processor.get_train_examples(args.data_dir)
                )
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                if local_rank in [-1, 0]:
                    start = time.time()
                    torch.save(self.features, cached_features_file)
                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                    logger.info(
                        "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                    )

    def __len__(self):
        """Return the number of cached feature objects."""
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """Return the feature object at index `i`."""
        return self.features[i]
|
||||
101
src/transformers/data/datasets/language_modeling.py
Normal file
101
src/transformers/data/datasets/language_modeling.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
|
||||
import torch
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...trainer import torch_distributed_zero_first
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextDataset(Dataset):
    """
    Dataset of fixed-size token blocks cut from a single text file, cached on disk.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(
        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1,
    ):
        """
        Load the token blocks from the cache file if present, otherwise build and cache them.

        Args:
            tokenizer: Tokenizer used to tokenize the text and add special tokens.
            file_path: Path of the UTF-8 text file to read.
            block_size: Length of each example, special tokens included.
            overwrite_cache: Rebuild the examples even if a cache file exists.
            local_rank: Rank of the current process; only ranks -1 and 0 write the cache.
        """
        assert os.path.isfile(file_path), f"{file_path} is not a file"

        # Reserve room for the special tokens added to every block below.
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
        )

        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                # NOTE(review): pickle.load executes arbitrary code from the file;
                # only load caches you trust.
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                # Pass the path as a %-argument: interpolating it into the format string
                # breaks the log record when the path itself contains a '%'.
                logger.info(
                    "Loading features from cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

            else:
                logger.info("Creating features from dataset file at %s", directory)

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                # Only one process writes the cache (same rule as GlueDataset); with
                # overwrite_cache set, every rank reaches this branch and would otherwise
                # write the same file concurrently.
                if local_rank in [-1, 0]:
                    start = time.time()
                    with open(cached_features_file, "wb") as handle:
                        pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    logger.info(
                        "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                    )

    def __len__(self):
        """Return the number of token blocks."""
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        """Return block `i` as a 1-D long tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)
|
||||
|
||||
|
||||
class LineByLineTextDataset(Dataset):
    """
    Dataset with one example per non-blank line of a text file (no caching).

    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(
        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1, max_lines=50_000
    ):
        """
        Read `file_path` and encode each non-blank line.

        Args:
            tokenizer: Tokenizer used to encode the lines.
            file_path: Path of the UTF-8 text file to read.
            block_size: Passed to the tokenizer as `max_length`.
            local_rank: Accepted for interface parity with `TextDataset`; not used here.
            max_lines: Maximum number of non-blank lines to keep (default 50 000,
                the previously hard-coded limit). Pass `None` to keep every line.
        """
        assert os.path.isfile(file_path), f"{file_path} is not a file"
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        if max_lines is not None and len(lines) > max_lines:
            # Make the truncation visible instead of silently dropping data.
            logger.warning(
                "Keeping only the first %d of %d non-blank lines from %s", max_lines, len(lines), file_path
            )
            lines = lines[:max_lines]

        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        """Return the number of encoded lines."""
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        """Return encoded line `i` as a 1-D long tensor of token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...file_utils import is_tf_available
|
||||
@@ -153,6 +154,11 @@ def _glue_convert_examples_to_features(
|
||||
return features
|
||||
|
||||
|
||||
class OutputMode(Enum):
    """Kinds of task output; each member's value equals its name."""

    # Output is a discrete class label.
    classification = "classification"
    # Output is a continuous value.
    regression = "regression"
|
||||
|
||||
|
||||
class MrpcProcessor(DataProcessor):
|
||||
"""Processor for the MRPC data set (GLUE version)."""
|
||||
|
||||
|
||||
@@ -14,13 +14,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import csv
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
|
||||
@@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=False)
|
||||
@dataclass
|
||||
class InputExample:
|
||||
"""
|
||||
A single training/test example for simple sequence classification.
|
||||
@@ -50,42 +49,37 @@ class InputExample:
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
|
||||
return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
@dataclass(frozen=True)
|
||||
class InputFeatures:
|
||||
"""
|
||||
A single set of features of data.
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
|
||||
Args:
|
||||
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||
attention_mask: Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
|
||||
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
|
||||
label: Label corresponding to the input
|
||||
token_type_ids: (Optional) Segment token indices to indicate first and second
|
||||
portions of the inputs. Only some models use them.
|
||||
label: (Optional) Label corresponding to the input. Int for classification problems,
|
||||
float for regression problems.
|
||||
"""
|
||||
|
||||
def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
|
||||
self.input_ids = input_ids
|
||||
self.attention_mask = attention_mask
|
||||
self.token_type_ids = token_type_ids
|
||||
self.label = label
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
|
||||
def to_dict(self):
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
input_ids: List[int]
|
||||
attention_mask: Optional[List[int]] = None
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
label: Optional[Union[int, float]] = None
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), sort_keys=True) + "\n"
|
||||
return json.dumps(dataclasses.asdict(self)) + "\n"
|
||||
|
||||
|
||||
class DataProcessor(object):
|
||||
class DataProcessor:
|
||||
"""Base class for data converters for sequence classification data sets."""
|
||||
|
||||
def get_example_from_tensor_dict(self, tensor_dict):
|
||||
|
||||
Reference in New Issue
Block a user