Experimental support for fairscale ShardedDDP (#9139)

* Experimental support for fairscale ShardedDDP * Add import error if fairscale not available * Address review comments * Fix seq2seq trainer
2020-12-16 13:47:48 -05:00
parent 1c1a2ffbff
commit 9a67185344
4 changed files with 78 additions and 19 deletions
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -215,6 +215,9 @@ class TrainingArguments:
            The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
            :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
            other choices will force the requested backend.
+        sharded_ddp (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Use Sharded DDP training from `FairScale <https://github.com/facebookresearch/fairscale>`__ (in distributed
+            training only). This is an experimental feature.
    """

    output_dir: str = field(
@@ -386,6 +389,10 @@ class TrainingArguments:
        default="auto",
        metadata={"help": "The backend to be used for mixed precision.", "choices": ["auto", "amp", "apex"]},
    )
+    sharded_ddp: bool = field(
+        default=False,
+        metadata={"help": "Whether or not to use sharded DDP training (in distributed training only)."},
+    )

    def __post_init__(self):
        if self.disable_tqdm is None: