Trainer (#3800)
* doc
* [tests] Add sample files for a regression task
* [HUGE] Trainer
* Feedback from @sshleifer
* Feedback from @thomwolf + logging tweak
* [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes
* [glue] Use default max_seq_length of 128 like before
* [glue] move DataTrainingArguments around
* [ner] Change interface of InputExample, and align run_{tf,pl}
* Re-align the pl scripts a little bit
* ner
* [ner] Add integration test
* Fix language_modeling with API tweak
* [ci] Tweak loss target
* Don't break console output
* amp.initialize: model must be on right device before
* [multiple-choice] update for Trainer
* Re-align to 827d6d6ef0
This commit is contained in:
@@ -1,5 +1,17 @@
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from .file_utils import cached_property, is_torch_available, torch_required
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -22,6 +34,7 @@ class TrainingArguments:
|
||||
|
||||
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
|
||||
do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
|
||||
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
|
||||
evaluate_during_training: bool = field(
|
||||
default=False, metadata={"help": "Run evaluation during training at each logging step."}
|
||||
)
|
||||
@@ -44,6 +57,8 @@ class TrainingArguments:
|
||||
)
|
||||
warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
|
||||
|
||||
logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
|
||||
logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
|
||||
logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
|
||||
save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
|
||||
save_total_limit: Optional[int] = field(
|
||||
@@ -52,12 +67,6 @@ class TrainingArguments:
|
||||
"help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default"
|
||||
},
|
||||
)
|
||||
eval_all_checkpoints: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
|
||||
},
|
||||
)
|
||||
no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"})
|
||||
seed: int = field(default=42, metadata={"help": "random seed for initialization"})
|
||||
|
||||
@@ -73,3 +82,47 @@ class TrainingArguments:
|
||||
},
|
||||
)
|
||||
local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
|
||||
|
||||
@property
def train_batch_size(self) -> int:
    """
    Total training batch size: the per-GPU size multiplied by the GPU count.

    The GPU count is floored at 1, so when ``n_gpu`` is 0 the per-GPU
    value is returned unchanged.
    """
    per_device = self.per_gpu_train_batch_size
    device_count = max(1, self.n_gpu)
    return per_device * device_count
|
||||
|
||||
@property
def eval_batch_size(self) -> int:
    """
    Total evaluation batch size: the per-GPU size multiplied by the GPU count.

    The GPU count is floored at 1, so when ``n_gpu`` is 0 the per-GPU
    value is returned unchanged.
    """
    per_device = self.per_gpu_eval_batch_size
    device_count = max(1, self.n_gpu)
    return per_device * device_count
|
||||
|
||||
@cached_property
@torch_required
def _setup_devices(self) -> Tuple["torch.device", int]:
    """
    Pick the torch device and GPU count implied by ``no_cuda`` and ``local_rank``.

    Returns:
        A ``(device, n_gpu)`` pair:

        * ``no_cuda`` set: the CPU device and 0 GPUs.
        * ``local_rank == -1``: ``cuda`` if available, otherwise ``cpu``,
          together with ``torch.cuda.device_count()``.
        * any other ``local_rank``: the distributed process group is
          initialized with the ``nccl`` backend, and the CUDA device at
          index ``local_rank`` is returned with a GPU count of 1.

    Wrapped in ``cached_property``, so the setup is presumably performed only
    once per instance (the helper lives in ``file_utils``; confirm there).
    """
    logger.info("PyTorch: setting up devices")

    # CUDA explicitly disabled: stay on CPU and report no GPUs.
    if self.no_cuda:
        return torch.device("cpu"), 0

    # Non-distributed run. If n_gpu is > 1 we'll use nn.DataParallel.
    # To restrict the run to a specific subset of GPUs, use `CUDA_VISIBLE_DEVICES=0`.
    if self.local_rank == -1:
        device_type = "cuda" if torch.cuda.is_available() else "cpu"
        return torch.device(device_type), torch.cuda.device_count()

    # Distributed run via torch.distributed: initialize the backend, which
    # takes care of synchronizing nodes/GPUs. Each process drives one GPU.
    torch.distributed.init_process_group(backend="nccl")
    return torch.device("cuda", self.local_rank), 1
|
||||
|
||||
@property
@torch_required
def device(self) -> "torch.device":
    """
    The torch device chosen by ``_setup_devices`` (first element of its result).
    """
    selected_device, _ = self._setup_devices
    return selected_device
|
||||
|
||||
@property
@torch_required
def n_gpu(self) -> int:
    """
    The GPU count reported by ``_setup_devices`` (second element of its result).

    Annotated as ``int`` to match the ``Tuple["torch.device", int]`` return
    type declared on ``_setup_devices`` and the annotated ``device`` property.
    """
    return self._setup_devices[1]
|
||||
|
||||
def to_json_string(self):
    """
    Return this instance's dataclass fields as a JSON string indented by 2 spaces.
    """
    as_dict = dataclasses.asdict(self)
    return json.dumps(as_dict, indent=2)
|
||||
|
||||
Reference in New Issue
Block a user