[trainer] remove --model_parallel (#9451)

* fix bad merge - dropped code

* remove --model_parallel

* Deal with TrainingArguments

* Use a private attr and fix batch sizes

* fix _n_gpu

* add is_parallel helper wrapper

* fix attribute

* introduce a new attribute is_model_parallel

* docs

* docs

* Put back init False and rearrange doc

* Ignore non-init args in HFArgumentParser

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
Stas Bekman
2021-01-11 06:39:28 -08:00
committed by GitHub
parent 6f63501383
commit 33b7422839
3 changed files with 30 additions and 35 deletions

View File

@@ -210,9 +210,6 @@ class TrainingArguments:
- :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
:obj:`"eval_loss"`.
- :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
model_parallel (:obj:`bool`, `optional`, defaults to :obj:`False`):
If the model supports model parallelism and there is more than one device, whether to use model parallelism
to distribute the model's modules across devices or not.
ignore_skip_data (:obj:`bool`, `optional`, defaults to :obj:`False`):
When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
@@ -245,15 +242,6 @@ class TrainingArguments:
do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
model_parallel: bool = field(
default=False,
metadata={
"help": (
"If there are more than one devices, whether to use model parallelism to distribute the "
"model's modules across devices."
)
},
)
evaluation_strategy: EvaluationStrategy = field(
default="no",
metadata={"help": "The evaluation strategy to use."},
@@ -410,6 +398,7 @@ class TrainingArguments:
default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."}
)
adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace Adam by Adafactor."})
_n_gpu: int = field(init=False, repr=False, default=0)
def __post_init__(self):
if self.disable_tqdm is None:
@@ -430,6 +419,7 @@ class TrainingArguments:
if is_torch_available() and self.device.type != "cuda" and self.fp16:
raise ValueError("Mixed precision training with AMP or APEX (`--fp16`) can only be used on CUDA devices.")
self._n_gpu = torch.cuda.device_count()
def __repr__(self):
# We override the default repr to remove deprecated arguments from the repr. This method should be removed once
@@ -451,10 +441,7 @@ class TrainingArguments:
"version. Using `--per_device_train_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
if not self.model_parallel:
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
else:
train_batch_size = per_device_batch_size
train_batch_size = per_device_batch_size * max(1, self.n_gpu)
return train_batch_size
@property
@@ -468,10 +455,7 @@ class TrainingArguments:
"version. Using `--per_device_eval_batch_size` is preferred."
)
per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
if not self.model_parallel:
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
else:
eval_batch_size = per_device_batch_size
eval_batch_size = per_device_batch_size * max(1, self.n_gpu)
return eval_batch_size
@cached_property
@@ -492,7 +476,7 @@ class TrainingArguments:
# GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
# will use the first GPU in that env, i.e. GPU#1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu = self._n_gpu
else:
# Here, we'll use torch.distributed.
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs