Trainer (#3800)
* doc
* [tests] Add sample files for a regression task
* [HUGE] Trainer
* Feedback from @sshleifer
* Feedback from @thomwolf + logging tweak
* [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes
* [glue] Use default max_seq_length of 128 like before
* [glue] move DataTrainingArguments around
* [ner] Change interface of InputExample, and align run_{tf,pl}
* Re-align the pl scripts a little bit
* ner
* [ner] Add integration test
* Fix language_modeling with API tweak
* [ci] Tweak loss target
* Don't break console output
* amp.initialize: model must be on right device before
* [multiple-choice] update for Trainer
* Re-align to 827d6d6ef0
This commit is contained in:
@@ -31,6 +31,8 @@ from .benchmark_utils import (
|
||||
start_memory_tracing,
|
||||
stop_memory_tracing,
|
||||
)
|
||||
|
||||
# Configurations
|
||||
from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig
|
||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, AutoConfig
|
||||
from .configuration_bart import BartConfig
|
||||
@@ -46,8 +48,6 @@ from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, Open
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
|
||||
|
||||
# Configurations
|
||||
from .configuration_utils import PretrainedConfig
|
||||
from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig
|
||||
from .configuration_xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
|
||||
@@ -121,6 +121,8 @@ from .pipelines import (
|
||||
TranslationPipeline,
|
||||
pipeline,
|
||||
)
|
||||
|
||||
# Tokenizers
|
||||
from .tokenization_albert import AlbertTokenizer
|
||||
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
|
||||
from .tokenization_bart import BartTokenizer, MBartTokenizer
|
||||
@@ -136,8 +138,6 @@ from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
from .tokenization_t5 import T5Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
|
||||
|
||||
# Tokenizers
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
@@ -162,6 +162,7 @@ if is_torch_available():
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoModelWithLMHead,
|
||||
AutoModelForTokenClassification,
|
||||
AutoModelForMultipleChoice,
|
||||
ALL_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
MODEL_MAPPING,
|
||||
MODEL_FOR_PRETRAINING_MAPPING,
|
||||
@@ -169,6 +170,7 @@ if is_torch_available():
|
||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
|
||||
)
|
||||
|
||||
from .modeling_bert import (
|
||||
@@ -320,6 +322,10 @@ if is_torch_available():
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
# Trainer
|
||||
from .trainer import Trainer, set_seed, torch_distributed_zero_first, EvalPrediction
|
||||
from .data.data_collator import DefaultDataCollator, DataCollator, DataCollatorForLanguageModeling
|
||||
from .data.datasets import GlueDataset, TextDataset, LineByLineTextDataset, GlueDataTrainingArguments
|
||||
|
||||
# TensorFlow
|
||||
if is_tf_available():
|
||||
|
||||
@@ -87,7 +87,7 @@ class PretrainedConfig(object):
|
||||
self.architectures = kwargs.pop("architectures", None)
|
||||
self.finetuning_task = kwargs.pop("finetuning_task", None)
|
||||
self.num_labels = kwargs.pop("num_labels", 2)
|
||||
self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
|
||||
self.id2label = kwargs.pop("id2label", {i: f"LABEL_{i}" for i in range(self.num_labels)})
|
||||
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
|
||||
self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
|
||||
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
|
||||
|
||||
144
src/transformers/data/data_collator.py
Normal file
144
src/transformers/data/data_collator.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, NewType, Tuple
|
||||
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
|
||||
from ..tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
class DataCollator(ABC):
    """
    Abstract interface for objects that batch samples of data and apply
    whatever pre-processing the training loop asks for.
    """

    @abstractmethod
    def collate_batch(self) -> Dict[str, torch.Tensor]:
        """
        Collate a list of samples taken from a Dataset into a single batch.

        Returns:
            A dictionary of tensors
        """
        return None
|
||||
|
||||
|
||||
InputDataClass = NewType("InputDataClass", Any)
|
||||
|
||||
|
||||
@dataclass
class DefaultDataCollator(DataCollator):
    """
    Minimal collator for batches of attribute-carrying objects:
        - every non-None, non-string attribute of the samples becomes a `torch.long` tensor
          stored under the attribute's own name
        - two attribute names get special treatment and are emitted under the key `labels`:
            - `label`: one value (int or float) per sample
            - `label_ids`: a list of values per sample
        - no other preprocessing is applied

    In other words, attribute names of the input objects are used as the names of the model inputs.
    See glue and ner for example of how it's useful.
    """

    def collate_batch(self, features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
        # The first sample stands in for the whole batch: every sample is assumed
        # to expose the same attributes, with the same ones set to None.
        sample = features[0]
        batch = {}

        # Labels go first. The dtype is chosen explicitly from the Python type of the
        # first sample's label: exact `int` -> long, anything else -> float.
        if getattr(sample, "label", None) is not None:
            label_dtype = torch.long if type(sample.label) is int else torch.float
            batch["labels"] = torch.tensor([f.label for f in features], dtype=label_dtype)
        elif getattr(sample, "label_ids", None) is not None:
            label_dtype = torch.long if type(sample.label_ids[0]) is int else torch.float
            batch["labels"] = torch.tensor([f.label_ids for f in features], dtype=label_dtype)

        # Every remaining attribute that is set (and is not a string) on the first
        # sample is gathered across the batch as a long tensor.
        for name, value in vars(sample).items():
            if name in ("label", "label_ids") or value is None or isinstance(value, str):
                continue
            batch[name] = torch.tensor([getattr(f, name) for f in features], dtype=torch.long)
        return batch
|
||||
|
||||
|
||||
@dataclass
class DataCollatorForLanguageModeling(DataCollator):
    """
    Collator for language-modeling batches.
    - stacks the example tensors into one batch, padding with the tokenizer's pad token when lengths differ
    - when `mlm` is set, corrupts the batch for masked language modeling
    """

    tokenizer: PreTrainedTokenizer
    mlm: bool = True
    mlm_probability: float = 0.15

    def collate_batch(self, examples: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
        batch = self._tensorize_batch(examples)
        if not self.mlm:
            # The same tensor object is handed back as both inputs and labels.
            return {"input_ids": batch, "labels": batch}
        inputs, labels = self.mask_tokens(batch)
        return {"input_ids": inputs, "masked_lm_labels": labels}

    def _tensorize_batch(self, examples: List[torch.Tensor]) -> torch.Tensor:
        # Equal-length examples can simply be stacked; otherwise padding is required.
        reference_length = examples[0].size(0)
        if all(example.size(0) == reference_length for example in examples):
            return torch.stack(examples, dim=0)
        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({self.tokenizer.__class__.__name__}) does not have one."
            )
        return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Build masked-LM inputs and labels: of the selected tokens, 80% become MASK, 10% a random token,
        10% stay as they are. `inputs` is modified in place and returned together with the labels.
        """

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )

        labels = inputs.clone()
        # Each position starts with probability `mlm_probability` of being selected;
        # special tokens and padding are then excluded by zeroing their probability.
        selection_probs = torch.full(labels.shape, self.mlm_probability)
        special_rows = [
            self.tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True) for row in labels.tolist()
        ]
        selection_probs.masked_fill_(torch.tensor(special_rows, dtype=torch.bool), value=0.0)
        if self.tokenizer._pad_token is not None:
            selection_probs.masked_fill_(labels.eq(self.tokenizer.pad_token_id), value=0.0)
        selected = torch.bernoulli(selection_probs).bool()
        labels[~selected] = -100  # We only compute loss on masked tokens

        # 80% of the selected positions are overwritten with the mask token id.
        to_mask = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & selected
        inputs[to_mask] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # Half of what is left (10% overall) is overwritten with a random token id.
        to_randomize = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & selected & ~to_mask
        random_ids = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[to_randomize] = random_ids[to_randomize]

        # The remaining selected positions (10% overall) keep their original token.
        return inputs, labels
|
||||
6
src/transformers/data/datasets/__init__.py
Normal file
6
src/transformers/data/datasets/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
from .glue import GlueDataset, GlueDataTrainingArguments
|
||||
from .language_modeling import LineByLineTextDataset, TextDataset
|
||||
124
src/transformers/data/datasets/glue.py
Normal file
124
src/transformers/data/datasets/glue.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...tokenization_xlm_roberta import XLMRobertaTokenizer
|
||||
from ...trainer import torch_distributed_zero_first
|
||||
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
|
||||
from ..processors.utils import InputFeatures
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class GlueDataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    The `help` entry in each field's metadata carries the human-readable description of that field.
    """

    # Required. The help text lists the keys of `glue_processors`; the value is lower-cased in `__post_init__`.
    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
    # Required. Directory holding the task's data files.
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    # Optional, defaults to 128.
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    # Optional, defaults to False.
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

    def __post_init__(self):
        # Normalize the task name to lower case once, right after construction.
        self.task_name = self.task_name.lower()
|
||||
|
||||
|
||||
class GlueDataset(Dataset):
    """
    Dataset of pre-computed features for one GLUE task, built once and cached on disk.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    args: GlueDataTrainingArguments
    output_mode: str
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        limit_length: Optional[int] = None,
        evaluate=False,
        local_rank=-1,
    ):
        """
        Load the features from the cache file if it exists, otherwise build them and write the cache.

        Args:
            args: task name, data directory, max sequence length and cache-overwrite flag.
            tokenizer: tokenizer used to build the features; its class name is part of the cache file name.
            limit_length: if given, keep only the first `limit_length` examples when features are (re)built.
            evaluate: use the dev examples (and the "dev" cache file) instead of the train ones.
            local_rank: passed to `torch_distributed_zero_first`; only ranks -1 and 0 write the cache file.
        """
        self.args = args
        processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        # Load data features from cache or dataset file
        # The cache file lives in the data directory and is keyed on split, tokenizer class,
        # max sequence length and task name.
        # NOTE(review): `limit_length` is not part of the file name and is not applied when the
        # cache is loaded, so a cache written by a truncated run is reused as-is — confirm intended.
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}_{}".format(
                "dev" if evaluate else "train", tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
            ),
        )
        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")
                label_list = processor.get_labels()
                if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
                    RobertaTokenizer,
                    RobertaTokenizerFast,
                    XLMRobertaTokenizer,
                ):
                    # HACK(label indices are swapped in RoBERTa pretrained model)
                    label_list[1], label_list[2] = label_list[2], label_list[1]
                examples = (
                    processor.get_dev_examples(args.data_dir)
                    if evaluate
                    else processor.get_train_examples(args.data_dir)
                )
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                # Only a non-distributed run (-1) or rank 0 persists the freshly built features.
                if local_rank in [-1, 0]:
                    start = time.time()
                    torch.save(self.features, cached_features_file)
                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                    logger.info(
                        f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                    )

    def __len__(self):
        # Number of feature objects held in memory.
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        # Plain positional lookup; no tensor conversion happens here.
        return self.features[i]
|
||||
101
src/transformers/data/datasets/language_modeling.py
Normal file
101
src/transformers/data/datasets/language_modeling.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import time
|
||||
|
||||
import torch
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...trainer import torch_distributed_zero_first
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextDataset(Dataset):
    """
    Dataset that tokenizes a whole text file and cuts it into fixed-size blocks of token ids,
    caching the result next to the input file.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(
        self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, overwrite_cache=False, local_rank=-1,
    ):
        """
        Args:
            tokenizer: tokenizer used to tokenize the file and to add special tokens to each block.
            file_path: path to an existing text file, read as UTF-8.
            block_size: total length of each example, special tokens included.
            overwrite_cache: rebuild the examples even if a cache file already exists.
            local_rank: passed to `torch_distributed_zero_first`.
        """
        # NOTE(review): this check is an `assert`, so it disappears when Python runs with -O.
        assert os.path.isfile(file_path)

        # Reserve room for the special tokens that are added around every block below.
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

        # The cache file sits in the same directory as the input and is keyed on
        # tokenizer class, (reduced) block size and input file name.
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename,),
        )

        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                # NOTE(security): unpickling executes arbitrary code; only load cache files
                # from a directory you trust.
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    f"Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        # Number of blocks.
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        # Examples are stored as lists of ids and converted to a long tensor on access.
        return torch.tensor(self.examples[i], dtype=torch.long)
|
||||
|
||||
|
||||
class LineByLineTextDataset(Dataset):
    """
    Dataset that turns every non-blank line of a text file into one example.

    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
        """
        Args:
            tokenizer: tokenizer used to encode the lines (special tokens are added).
            file_path: path to an existing text file, read as UTF-8.
            block_size: handed to the tokenizer as `max_length`.
            local_rank: accepted but not used by this class.
        """
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info("Creating features from dataset file at %s", file_path)

        # Keep every line that is neither empty nor whitespace-only.
        # The list is deliberately NOT truncated: a previous hard-coded `lines[:50_000]`
        # slice silently discarded all examples past the first 50,000 lines.
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]

    def __len__(self):
        # Number of encoded lines.
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        # Examples are stored as lists of ids and converted to a long tensor on access.
        return torch.tensor(self.examples[i], dtype=torch.long)
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
from enum import Enum
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...file_utils import is_tf_available
|
||||
@@ -153,6 +154,11 @@ def _glue_convert_examples_to_features(
|
||||
return features
|
||||
|
||||
|
||||
class OutputMode(Enum):
    """Output kinds distinguished in this module; each member's value is its own name as a string."""

    classification = "classification"
    regression = "regression"
|
||||
|
||||
|
||||
class MrpcProcessor(DataProcessor):
|
||||
"""Processor for the MRPC data set (GLUE version)."""
|
||||
|
||||
|
||||
@@ -14,13 +14,12 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import copy
|
||||
import csv
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
|
||||
@@ -28,7 +27,7 @@ from ...file_utils import is_tf_available, is_torch_available
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=False)
|
||||
@dataclass
|
||||
class InputExample:
|
||||
"""
|
||||
A single training/test example for simple sequence classification.
|
||||
@@ -50,42 +49,37 @@ class InputExample:
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
|
||||
return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
|
||||
|
||||
|
||||
class InputFeatures(object):
|
||||
@dataclass(frozen=True)
|
||||
class InputFeatures:
|
||||
"""
|
||||
A single set of features of data.
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
|
||||
Args:
|
||||
input_ids: Indices of input sequence tokens in the vocabulary.
|
||||
attention_mask: Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
|
||||
token_type_ids: Segment token indices to indicate first and second portions of the inputs.
|
||||
label: Label corresponding to the input
|
||||
token_type_ids: (Optional) Segment token indices to indicate first and second
|
||||
portions of the inputs. Only some models use them.
|
||||
label: (Optional) Label corresponding to the input. Int for classification problems,
|
||||
float for regression problems.
|
||||
"""
|
||||
|
||||
def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None):
|
||||
self.input_ids = input_ids
|
||||
self.attention_mask = attention_mask
|
||||
self.token_type_ids = token_type_ids
|
||||
self.label = label
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_json_string())
|
||||
|
||||
def to_dict(self):
|
||||
"""Serializes this instance to a Python dictionary."""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
return output
|
||||
input_ids: List[int]
|
||||
attention_mask: Optional[List[int]] = None
|
||||
token_type_ids: Optional[List[int]] = None
|
||||
label: Optional[Union[int, float]] = None
|
||||
|
||||
def to_json_string(self):
|
||||
"""Serializes this instance to a JSON string."""
|
||||
return json.dumps(self.to_dict(), sort_keys=True) + "\n"
|
||||
return json.dumps(dataclasses.asdict(self)) + "\n"
|
||||
|
||||
|
||||
class DataProcessor(object):
|
||||
class DataProcessor:
|
||||
"""Base class for data converters for sequence classification data sets."""
|
||||
|
||||
def get_example_from_tensor_dict(self, tensor_dict):
|
||||
|
||||
@@ -456,6 +456,11 @@ def get_from_cache(
|
||||
lock_path = cache_path + ".lock"
|
||||
with FileLock(lock_path):
|
||||
|
||||
# If the download just completed while the lock was activated.
|
||||
if os.path.exists(cache_path) and not force_download:
|
||||
# Even if returning early like here, the lock will be released.
|
||||
return cache_path
|
||||
|
||||
if resume_download:
|
||||
incomplete_path = cache_path + ".incomplete"
|
||||
|
||||
@@ -496,3 +501,50 @@ def get_from_cache(
|
||||
json.dump(meta, meta_file)
|
||||
|
||||
return cache_path
|
||||
|
||||
|
||||
class cached_property(property):
    """
    Descriptor that mimics @property but caches output in member variable.

    The value is computed by the wrapped getter on first access and stored on the
    instance under the attribute ``"__cached_" + <getter name>``; later accesses
    return the stored value, including when that value is ``None``.

    From tensorflow_datasets

    Built-in in functools from Python 3.8.
    """

    # Marks "nothing cached yet". A dedicated object is used instead of None so that
    # a getter which legitimately returns None is still evaluated only once.
    _NOT_CACHED = object()

    def __get__(self, obj, objtype=None):
        """
        Return the cached value for `obj`, computing and storing it on first access.

        Accessing the attribute on the class itself (`obj is None`) returns the descriptor.

        Raises:
            AttributeError: if the property was created without a getter.
        """
        # See docs.python.org/3/howto/descriptor.html#properties
        if obj is None:
            return self
        if self.fget is None:
            raise AttributeError("unreadable attribute")
        attr = "__cached_" + self.fget.__name__
        cached = getattr(obj, attr, self._NOT_CACHED)
        if cached is self._NOT_CACHED:
            cached = self.fget(obj)
            setattr(obj, attr, cached)
        return cached
|
||||
|
||||
|
||||
def torch_required(func):
    """Decorator: the wrapped callable raises ImportError when PyTorch is not available."""
    # Chose a different decorator name than in tests so it's clear they are not the same.
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Availability is checked on every call, not at decoration time.
        if not is_torch_available():
            raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
        return func(*args, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
def tf_required(func):
    """Decorator: the wrapped callable raises ImportError when TensorFlow is not available."""
    # Chose a different decorator name than in tests so it's clear they are not the same.
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Availability is checked on every call, not at decoration time.
        if not is_tf_available():
            raise ImportError(f"Method `{func.__name__}` requires TF.")
        return func(*args, **kwargs)

    return wrapper
|
||||
|
||||
@@ -55,6 +55,7 @@ from .modeling_bart import (
|
||||
from .modeling_bert import (
|
||||
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
BertForMaskedLM,
|
||||
BertForMultipleChoice,
|
||||
BertForPreTraining,
|
||||
BertForQuestionAnswering,
|
||||
BertForSequenceClassification,
|
||||
@@ -64,6 +65,7 @@ from .modeling_bert import (
|
||||
from .modeling_camembert import (
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
CamembertForMaskedLM,
|
||||
CamembertForMultipleChoice,
|
||||
CamembertForSequenceClassification,
|
||||
CamembertForTokenClassification,
|
||||
CamembertModel,
|
||||
@@ -96,6 +98,7 @@ from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTL
|
||||
from .modeling_roberta import (
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
RobertaForMaskedLM,
|
||||
RobertaForMultipleChoice,
|
||||
RobertaForQuestionAnswering,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaForTokenClassification,
|
||||
@@ -114,12 +117,14 @@ from .modeling_xlm import (
|
||||
from .modeling_xlm_roberta import (
|
||||
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
XLMRobertaForMaskedLM,
|
||||
XLMRobertaForMultipleChoice,
|
||||
XLMRobertaForSequenceClassification,
|
||||
XLMRobertaForTokenClassification,
|
||||
XLMRobertaModel,
|
||||
)
|
||||
from .modeling_xlnet import (
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
XLNetForMultipleChoice,
|
||||
XLNetForQuestionAnsweringSimple,
|
||||
XLNetForSequenceClassification,
|
||||
XLNetForTokenClassification,
|
||||
@@ -259,7 +264,18 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
)
|
||||
|
||||
|
||||
class AutoModel(object):
|
||||
MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
|
||||
[
|
||||
(CamembertConfig, CamembertForMultipleChoice),
|
||||
(XLMRobertaConfig, XLMRobertaForMultipleChoice),
|
||||
(RobertaConfig, RobertaForMultipleChoice),
|
||||
(BertConfig, BertForMultipleChoice),
|
||||
(XLNetConfig, XLNetForMultipleChoice),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class AutoModel:
|
||||
r"""
|
||||
:class:`~transformers.AutoModel` is a generic model class
|
||||
that will be instantiated as one of the base model classes of the library
|
||||
@@ -410,7 +426,7 @@ class AutoModel(object):
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForPreTraining(object):
|
||||
class AutoModelForPreTraining:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForPreTraining` is a generic model class
|
||||
that will be instantiated as one of the model classes of the library -with the architecture used for pretraining this model– when created with the `AutoModelForPreTraining.from_pretrained(pretrained_model_name_or_path)`
|
||||
@@ -552,7 +568,7 @@ class AutoModelForPreTraining(object):
|
||||
)
|
||||
|
||||
|
||||
class AutoModelWithLMHead(object):
|
||||
class AutoModelWithLMHead:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelWithLMHead` is a generic model class
|
||||
that will be instantiated as one of the language modeling model classes of the library
|
||||
@@ -696,7 +712,7 @@ class AutoModelWithLMHead(object):
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForSequenceClassification(object):
|
||||
class AutoModelForSequenceClassification:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForSequenceClassification` is a generic model class
|
||||
that will be instantiated as one of the sequence classification model classes of the library
|
||||
@@ -843,7 +859,7 @@ class AutoModelForSequenceClassification(object):
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForQuestionAnswering(object):
|
||||
class AutoModelForQuestionAnswering:
|
||||
r"""
|
||||
:class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
|
||||
that will be instantiated as one of the question answering model classes of the library
|
||||
@@ -1126,3 +1142,55 @@ class AutoModelForTokenClassification:
|
||||
", ".join(c.__name__ for c in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys()),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class AutoModelForMultipleChoice:
    r"""
        :class:`~transformers.AutoModelForMultipleChoice` is a generic model class
        that resolves to one of the multiple choice model classes of the library
        when built through the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)`
        class method.

        Direct instantiation through `__init__()` is not supported and raises an error.
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoModelForMultipleChoice is designed to be instantiated "
            "using the `AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path)` or "
            "`AutoModelForMultipleChoice.from_config(config)` methods."
        )

    @classmethod
    def from_config(cls, config):
        # The first mapping entry whose config class matches `config` wins.
        matches = (
            model_class
            for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items()
            if isinstance(config, config_class)
        )
        chosen = next(matches, None)
        if chosen is not None:
            return chosen(config)

        supported = ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys())
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(config.__class__, cls.__name__, supported)
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # A caller-supplied `config` is used only if it is a PretrainedConfig;
        # otherwise the configuration is resolved from the model name or path.
        config = kwargs.pop("config", None)
        if not isinstance(config, PretrainedConfig):
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

        # The first mapping entry whose config class matches `config` wins.
        matches = (
            model_class
            for config_class, model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items()
            if isinstance(config, config_class)
        )
        chosen = next(matches, None)
        if chosen is not None:
            return chosen.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)

        supported = ", ".join(c.__name__ for c in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.keys())
        raise ValueError(
            "Unrecognized configuration class {} for this kind of AutoModel: {}.\n"
            "Model type should be one of {}.".format(config.__class__, cls.__name__, supported)
        )
|
||||
|
||||
558
src/transformers/trainer.py
Normal file
558
src/transformers/trainer.py
Normal file
@@ -0,0 +1,558 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import shutil
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
from torch.utils.data.dataset import Dataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from torch.utils.data.sampler import RandomSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from .data.data_collator import DataCollator, DefaultDataCollator
|
||||
from .modeling_utils import PreTrainedModel
|
||||
from .optimization import AdamW, get_linear_schedule_with_warmup
|
||||
from .training_args import TrainingArguments
|
||||
|
||||
|
||||
# apex is optional: record whether the import worked so callers can check before touching `amp`.
try:
    from apex import amp
except ImportError:
    _has_apex = False
else:
    _has_apex = True


def is_apex_available():
    """Return whether `apex` could be imported when this module was loaded."""
    return _has_apex
|
||||
|
||||
|
||||
# Prefer the SummaryWriter bundled with torch and fall back to tensorboardX;
# the flag is cleared only when neither import works.
_has_tensorboard = True
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    try:
        from tensorboardX import SummaryWriter
    except ImportError:
        _has_tensorboard = False


def is_tensorboard_available():
    """Return whether a `SummaryWriter` implementation could be imported at module load."""
    return _has_tensorboard
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def set_seed(seed: int):
    """Seed the `random`, numpy and torch (CPU and all CUDA devices) generators with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        # safe to call even if cuda is not available
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
|
||||
|
||||
|
||||
@contextmanager
def torch_distributed_zero_first(local_rank: int):
    """
    Context manager to make all processes in distributed training wait for the first one (locally) to do something.

    Processes whose `local_rank` is neither -1 nor 0 block on a barrier before entering the body.
    The process with `local_rank == 0` runs the body first and then joins the barrier, which
    releases the others. With `local_rank == -1` no barrier is used at all.

    Args:
        local_rank: local rank of the calling process, -1 when not running distributed.
    """
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()
    try:
        yield
    finally:
        # Join the barrier even if the body raised on rank 0; otherwise the waiting
        # processes would stay blocked in their own barrier call.
        if local_rank == 0:
            torch.distributed.barrier()
|
||||
|
||||
|
||||
class EvalPrediction(NamedTuple):
    """
    Evaluation output (always contains labels), to be used
    to compute metrics.

    Instances are passed to the user-supplied `compute_metrics` callable.
    """

    # Model predictions gathered over the whole evaluation set.
    predictions: np.ndarray
    # Reference labels aligned with `predictions`.
    label_ids: np.ndarray
|
||||
|
||||
|
||||
class PredictionOutput(NamedTuple):
    """Result of a prediction/evaluation pass: predictions plus optional labels and metrics."""

    # Model predictions gathered over the whole dataset.
    predictions: np.ndarray
    # Reference labels, or None when the dataset did not provide any.
    label_ids: Optional[np.ndarray]
    # Metric name -> value mapping, when metrics were computed.
    metrics: Optional[Dict[str, float]]
|
||||
|
||||
|
||||
class TrainOutput(NamedTuple):
    """Summary returned at the end of training."""

    # Number of optimization steps counted by the training loop.
    global_step: int
    # Accumulated training loss divided by `global_step`.
    training_loss: float
|
||||
|
||||
|
||||
PREFIX_CHECKPOINT_DIR = "checkpoint"
|
||||
|
||||
|
||||
class Trainer:
|
||||
"""
|
||||
Trainer is a simple but feature-complete training and eval loop for PyTorch,
|
||||
optimized for Transformers.
|
||||
"""
|
||||
|
||||
model: PreTrainedModel
|
||||
args: TrainingArguments
|
||||
data_collator: DataCollator
|
||||
train_dataset: Optional[Dataset]
|
||||
eval_dataset: Optional[Dataset]
|
||||
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None
|
||||
prediction_loss_only: bool
|
||||
tb_writer: Optional["SummaryWriter"] = None
|
||||
|
||||
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
):
    """
    Store the training components, set up Tensorboard logging, seed the RNGs and
    create the output directory.

    Args:
        model: the model to train and evaluate.
        args: training arguments (seed, output_dir, logging_dir, local_rank, ... are read here).
        data_collator: batch collator; a DefaultDataCollator is used when None.
        train_dataset: (Optional) dataset used by `train()`.
        eval_dataset: (Optional) dataset used by `evaluate()`.
        compute_metrics: (Optional) callable turning an EvalPrediction into a metrics dict.
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    self.model = model
    self.args = args
    self.data_collator = DefaultDataCollator() if data_collator is None else data_collator
    self.train_dataset, self.eval_dataset = train_dataset, eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only

    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )
    elif self.args.local_rank in [-1, 0]:
        # Only the non-distributed process or local rank 0 writes Tensorboard events.
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)

    set_seed(self.args.seed)

    # Create output directory if needed
    if self.args.local_rank in [-1, 0]:
        os.makedirs(self.args.output_dir, exist_ok=True)
|
||||
|
||||
def get_train_dataloader(self) -> DataLoader:
|
||||
if self.train_dataset is None:
|
||||
raise ValueError("Trainer: training requires a train_dataset.")
|
||||
train_sampler = (
|
||||
RandomSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset)
|
||||
)
|
||||
return DataLoader(
|
||||
self.train_dataset,
|
||||
batch_size=self.args.train_batch_size,
|
||||
sampler=train_sampler,
|
||||
collate_fn=self.data_collator.collate_batch,
|
||||
)
|
||||
|
||||
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
|
||||
if eval_dataset is None and self.eval_dataset is None:
|
||||
raise ValueError("Trainer: evaluation requires an eval_dataset.")
|
||||
return DataLoader(
|
||||
eval_dataset if eval_dataset is not None else self.eval_dataset,
|
||||
batch_size=self.args.eval_batch_size,
|
||||
shuffle=False,
|
||||
collate_fn=self.data_collator.collate_batch,
|
||||
)
|
||||
|
||||
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
|
||||
# We use the same batch_size as for eval.
|
||||
return DataLoader(
|
||||
test_dataset,
|
||||
batch_size=self.args.eval_batch_size,
|
||||
shuffle=False,
|
||||
collate_fn=self.data_collator.collate_batch,
|
||||
)
|
||||
|
||||
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    """
    Create the AdamW optimizer and the linear warmup/decay schedule.

    Parameters whose name contains "bias" or "LayerNorm.weight" are placed in a group with
    weight decay 0.0; all others use `args.weight_decay`.

    Args:
        num_training_steps: total number of optimization steps, passed to the scheduler.
    """
    no_decay = ["bias", "LayerNorm.weight"]
    decayed, undecayed = [], []
    for name, param in self.model.named_parameters():
        exempt = any(marker in name for marker in no_decay)
        (undecayed if exempt else decayed).append(param)

    optimizer_grouped_parameters = [
        {"params": decayed, "weight_decay": self.args.weight_decay},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
    )
    return optimizer, scheduler
|
||||
|
||||
def train(self, model_path: Optional[str] = None):
    """
    Main training entry point.

    Args:
        model_path:
            (Optional) Local path to model if model to train has been instantiated from a local path
            If present, we will try reloading the optimizer/scheduler states from there.

    Returns:
        A TrainOutput holding the final global step and the accumulated training loss divided
        by that step count (0.0 when no optimization step was performed).
    """
    train_dataloader = self.get_train_dataloader()

    # Optimizer updates per pass over the dataloader. Clamped to 1 because the loop below still
    # performs one update per epoch when the dataloader is shorter than the accumulation window,
    # and so that the epoch arithmetic can never divide by zero.
    num_update_steps_per_epoch = max(len(train_dataloader) // self.args.gradient_accumulation_steps, 1)

    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + 1
    else:
        t_total = int(num_update_steps_per_epoch * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(model_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    # The model must be on the right device before amp.initialize is called.
    model.to(self.args.device)
    if self.args.fp16:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataloader.dataset))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        self.args.train_batch_size
        * self.args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // num_update_steps_per_epoch
            steps_trained_in_current_epoch = global_step % num_update_steps_per_epoch

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            global_step = 0
            logger.info(" Starting fine-tuning.")

    # Each already-trained update step consumed `gradient_accumulation_steps` batches, so that
    # many batches (not update steps) must be skipped when resuming mid-epoch.
    batches_to_skip = steps_trained_in_current_epoch * self.args.gradient_accumulation_steps

    tr_loss = 0.0
    logging_loss = 0.0
    # Step at which the loss was last logged; used to average over the real number of steps.
    last_logged_step = global_step
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
    )
    for epoch in train_iterator:
        # DistributedSampler derives its shuffle from the epoch number; without this call
        # every epoch would iterate the data in the same order.
        if isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if batches_to_skip > 0:
                batches_to_skip -= 1
                continue

            tr_loss += self._training_step(model, inputs, optimizer)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if self.args.local_rank in [-1, 0]:
                    if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
                        global_step == 1 and self.args.logging_first_step
                    ):
                        logs = {}
                        if self.args.evaluate_during_training:
                            results = self.evaluate()
                            for key, value in results.items():
                                eval_key = "eval_{}".format(key)
                                logs[eval_key] = value

                        # Average over the steps actually taken since the previous log. This is
                        # always >= 1 here, so it is correct for the first-step log and cannot
                        # divide by zero when logging_steps is 0.
                        loss_scalar = (tr_loss - logging_loss) / (global_step - last_logged_step)
                        learning_rate_scalar = scheduler.get_last_lr()[0]
                        logs["learning_rate"] = learning_rate_scalar
                        logs["loss"] = loss_scalar
                        logging_loss = tr_loss
                        last_logged_step = global_step

                        if self.tb_writer:
                            for k, v in logs.items():
                                self.tb_writer.add_scalar(k, v, global_step)
                        epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))

                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a reference
                        # to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(self.args.output_dir, f"checkpoint-{global_step}")
                        self.save_model(output_dir)
                        self._rotate_checkpoints()
                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

            # Stop as soon as max_steps updates are done (">" would run one extra update).
            if self.args.max_steps > 0 and global_step >= self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and global_step >= self.args.max_steps:
            train_iterator.close()
            break

    if self.tb_writer:
        self.tb_writer.close()

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    # Guard against a run in which no optimization step happened.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return TrainOutput(global_step, average_loss)
|
||||
|
||||
def _training_step(
|
||||
self, model: nn.Module, inputs: Dict[str, torch.Tensor], optimizer: torch.optim.Optimizer
|
||||
) -> float:
|
||||
model.train()
|
||||
for k, v in inputs.items():
|
||||
inputs[k] = v.to(self.args.device)
|
||||
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if self.args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel training
|
||||
if self.args.gradient_accumulation_steps > 1:
|
||||
loss = loss / self.args.gradient_accumulation_steps
|
||||
|
||||
if self.args.fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
return loss.item()
|
||||
|
||||
def is_world_master(self) -> bool:
|
||||
"""
|
||||
This will be True only in one process, even in distributed mode,
|
||||
even when training on multiple machines.
|
||||
"""
|
||||
return self.args.local_rank == -1 or torch.distributed.get_rank() == 0
|
||||
|
||||
def save_model(self, output_dir: Optional[str] = None):
|
||||
"""
|
||||
Saving best-practices: if you use default names for the model,
|
||||
you can reload it using from_pretrained().
|
||||
|
||||
Will only save from the master process.
|
||||
"""
|
||||
if self.is_world_master():
|
||||
self._save(output_dir)
|
||||
|
||||
def _save(self, output_dir: Optional[str] = None):
    """
    Write the model (via `save_pretrained`) and the training arguments to `output_dir`.

    Args:
        output_dir: (Optional) target directory; defaults to `args.output_dir`.

    Raises:
        ValueError: if `self.model` is not a PreTrainedModel.
    """
    if output_dir is None:
        output_dir = self.args.output_dir
    os.makedirs(output_dir, exist_ok=True)
    logger.info("Saving model checkpoint to %s", output_dir)

    # Save a trained model and configuration using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    if not isinstance(self.model, PreTrainedModel):
        raise ValueError("Trainer.model appears to not be a PreTrainedModel")
    self.model.save_pretrained(output_dir)

    # Good practice: save your training arguments together with the trained model
    args_file = os.path.join(output_dir, "training_args.bin")
    torch.save(self.args, args_file)
|
||||
|
||||
def _sorted_checkpoints(self, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False) -> List[str]:
|
||||
ordering_and_checkpoint_path = []
|
||||
|
||||
glob_checkpoints = Path(self.args.output_dir).glob(f"{checkpoint_prefix}-*")
|
||||
|
||||
for path in glob_checkpoints:
|
||||
if use_mtime:
|
||||
ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
|
||||
else:
|
||||
regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
|
||||
if regex_match and regex_match.groups():
|
||||
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
|
||||
|
||||
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
|
||||
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
|
||||
return checkpoints_sorted
|
||||
|
||||
def _rotate_checkpoints(self, use_mtime=False) -> None:
|
||||
if not self.args.save_total_limit:
|
||||
return
|
||||
if self.args.save_total_limit <= 0:
|
||||
return
|
||||
|
||||
# Check if we should delete older checkpoint(s)
|
||||
checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime)
|
||||
if len(checkpoints_sorted) <= self.args.save_total_limit:
|
||||
return
|
||||
|
||||
number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - self.args.save_total_limit)
|
||||
checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
|
||||
for checkpoint in checkpoints_to_be_deleted:
|
||||
logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
|
||||
shutil.rmtree(checkpoint)
|
||||
|
||||
def evaluate(
|
||||
self, eval_dataset: Optional[Dataset] = None, prediction_loss_only: Optional[bool] = None
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Run evaluation and return metrics.
|
||||
|
||||
The calling script will be responsible for providing a method to compute metrics, as they are
|
||||
task-dependent.
|
||||
|
||||
Args:
|
||||
eval_dataset: (Optional) Pass a dataset if you wish to override
|
||||
the one on the instance.
|
||||
Returns:
|
||||
A dict containing:
|
||||
- the eval loss
|
||||
- the potential metrics computed from the predictions
|
||||
"""
|
||||
eval_dataloader = self.get_eval_dataloader(eval_dataset)
|
||||
|
||||
output = self._prediction_loop(eval_dataloader, description="Evaluation")
|
||||
return output.metrics
|
||||
|
||||
def predict(self, test_dataset: Dataset) -> PredictionOutput:
|
||||
"""
|
||||
Run prediction and return predictions and potential metrics.
|
||||
|
||||
Depending on the dataset and your use case, your test dataset may contain labels.
|
||||
In that case, this method will also return metrics, like in evaluate().
|
||||
"""
|
||||
test_dataloader = self.get_test_dataloader(test_dataset)
|
||||
return self._prediction_loop(test_dataloader, description="Prediction")
|
||||
|
||||
def _prediction_loop(
    self, dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

    Works both with or without labels.

    Args:
        dataloader: batches of keyword inputs for the model.
        description: label used in the log messages and the progress bar.
        prediction_loss_only: (Optional) when true, predictions and labels are not collected;
            defaults to `self.prediction_loss_only` when None.

    Returns:
        A PredictionOutput. `predictions` / `label_ids` are None when nothing was collected;
        `metrics` contains "loss" when at least one batch had labels, plus whatever
        `compute_metrics` returns when predictions and labels are both available.
    """
    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    # multi-gpu eval
    if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(self.model)
    else:
        model = self.model
    model.to(self.args.device)

    logger.info("***** Running %s *****", description)
    logger.info(" Num examples = %d", len(dataloader.dataset))
    logger.info(" Batch size = %d", dataloader.batch_size)
    eval_losses: List[float] = []
    # Per-batch arrays are collected and concatenated once at the end; appending to a growing
    # array on every batch would re-copy everything gathered so far each time.
    pred_chunks: List[np.ndarray] = []
    label_chunks: List[np.ndarray] = []
    model.eval()

    for inputs in tqdm(dataloader, desc=description):
        has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])

        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                # With labels, the model output starts with the loss followed by the logits.
                step_eval_loss, logits = outputs[:2]
                eval_losses += [step_eval_loss.mean().item()]
            else:
                logits = outputs[0]

        if not prediction_loss_only:
            pred_chunks.append(logits.detach().cpu().numpy())
            if inputs.get("labels") is not None:
                label_chunks.append(inputs["labels"].detach().cpu().numpy())

    preds: Optional[np.ndarray] = np.concatenate(pred_chunks, axis=0) if pred_chunks else None
    label_ids: Optional[np.ndarray] = np.concatenate(label_chunks, axis=0) if label_chunks else None

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics["loss"] = np.mean(eval_losses)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
|
||||
@@ -1,5 +1,17 @@
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from .file_utils import cached_property, is_torch_available, torch_required
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -22,6 +34,7 @@ class TrainingArguments:
|
||||
|
||||
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
|
||||
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
|
||||
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
|
||||
evaluate_during_training: bool = field(
|
||||
default=False, metadata={"help": "Run evaluation during training at each logging step."}
|
||||
)
|
||||
@@ -44,6 +57,8 @@ class TrainingArguments:
|
||||
)
|
||||
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
|
||||
|
||||
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
|
||||
logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
|
||||
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
|
||||
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
|
||||
save_total_limit: Optional[int] = field(
|
||||
@@ -52,12 +67,6 @@ class TrainingArguments:
|
||||
"help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default"
|
||||
},
|
||||
)
|
||||
eval_all_checkpoints: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
|
||||
},
|
||||
)
|
||||
no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"})
|
||||
seed: int = field(default=42, metadata={"help": "random seed for initialization"})
|
||||
|
||||
@@ -73,3 +82,47 @@ class TrainingArguments:
|
||||
},
|
||||
)
|
||||
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
|
||||
|
||||
@property
def train_batch_size(self) -> int:
    """Effective training batch size: the per-GPU size times the GPU count, counting at least one device."""
    per_device = self.per_gpu_train_batch_size
    device_count = max(1, self.n_gpu)
    return per_device * device_count
|
||||
|
||||
@property
def eval_batch_size(self) -> int:
    """Effective evaluation batch size: the per-GPU size times the GPU count, counting at least one device."""
    per_device = self.per_gpu_eval_batch_size
    device_count = max(1, self.n_gpu)
    return per_device * device_count
|
||||
|
||||
@cached_property
@torch_required
def _setup_devices(self) -> Tuple["torch.device", int]:
    """
    Pick the torch device and GPU count for this process.

    Returns:
        A `(device, n_gpu)` tuple:
            - CPU and 0 when `no_cuda` is set;
            - with `local_rank == -1`, CUDA if available (else CPU) and the visible GPU count;
            - otherwise the CUDA device `local_rank` and 1, after initializing the "nccl"
              process group.
    """
    logger.info("PyTorch: setting up devices")

    if self.no_cuda:
        return torch.device("cpu"), 0

    if self.local_rank != -1:
        # Here, we'll use torch.distributed.
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
        return torch.device("cuda", self.local_rank), 1

    # if n_gpu is > 1 we'll use nn.DataParallel.
    # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return device, torch.cuda.device_count()
|
||||
|
||||
@property
@torch_required
def device(self) -> "torch.device":
    """The torch device chosen by `_setup_devices`."""
    selected_device, _ = self._setup_devices
    return selected_device
|
||||
|
||||
@property
@torch_required
def n_gpu(self):
    """The GPU count reported by `_setup_devices`."""
    _, gpu_count = self._setup_devices
    return gpu_count
|
||||
|
||||
def to_json_string(self):
    """
    Serializes this instance to a JSON string.

    The dataclass fields are converted to a dict and dumped with a 2-space indent.
    """
    as_dict = dataclasses.asdict(self)
    return json.dumps(as_dict, indent=2)
|
||||
|
||||
Reference in New Issue
Block a user