Trainer (#3800)

* doc * [tests] Add sample files for a regression task * [HUGE] Trainer * Feedback from @sshleifer * Feedback from @thomwolf + logging tweak * [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes * [glue] Use default max_seq_length of 128 like before * [glue] move DataTrainingArguments around * [ner] Change interface of InputExample, and align run_{tf,pl} * Re-align the pl scripts a little bit * ner * [ner] Add integration test * Fix language_modeling with API tweak * [ci] Tweak loss target * Don't break console output * amp.initialize: model must be on right device before * [multiple-choice] update for Trainer * Re-align to 827d6d6ef0
2020-04-21 20:11:56 -04:00
parent eb5601b0a5
commit dd9d483d03
41 changed files with 2682 additions and 2567 deletions
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1,5 +1,17 @@
+import dataclasses
+import json
+import logging
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Optional, Tuple
+
+from .file_utils import cached_property, is_torch_available, torch_required
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.getLogger(__name__)


@dataclass
@@ -22,6 +34,7 @@ class TrainingArguments:

    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
+    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."}
    )
@@ -44,6 +57,8 @@ class TrainingArguments:
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

+    logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
+    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
@@ -52,12 +67,6 @@ class TrainingArguments:
            "help": "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default"
        },
    )
-    eval_all_checkpoints: bool = field(
-        default=False,
-        metadata={
-            "help": "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
-        },
-    )
    no_cuda: bool = field(default=False, metadata={"help": "Avoid using CUDA even if it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

@@ -73,3 +82,47 @@ class TrainingArguments:
        },
    )
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})
+
+    @property
+    def train_batch_size(self) -> int:
+        return self.per_gpu_train_batch_size * max(1, self.n_gpu)
+
+    @property
+    def eval_batch_size(self) -> int:
+        return self.per_gpu_eval_batch_size * max(1, self.n_gpu)
+
+    @cached_property
+    @torch_required
+    def _setup_devices(self) -> Tuple["torch.device", int]:
+        logger.info("PyTorch: setting up devices")
+        if self.no_cuda:
+            device = torch.device("cpu")
+            n_gpu = 0
+        elif self.local_rank == -1:
+            # If n_gpu is > 1, we'll use nn.DataParallel.
+            # To use only a specific subset of GPUs, set e.g. `CUDA_VISIBLE_DEVICES=0`.
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            n_gpu = torch.cuda.device_count()
+        else:
+            # Here, we'll use torch.distributed.
+            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+            torch.distributed.init_process_group(backend="nccl")
+            device = torch.device("cuda", self.local_rank)
+            n_gpu = 1
+        return device, n_gpu
+
+    @property
+    @torch_required
+    def device(self) -> "torch.device":
+        return self._setup_devices[0]
+
+    @property
+    @torch_required
+    def n_gpu(self):
+        return self._setup_devices[1]
+
+    def to_json_string(self):
+        """
+        Serializes this instance to a JSON string.
+        """
+        return json.dumps(dataclasses.asdict(self), indent=2)