[TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)

* Fix the issue to properly run the accumulator with TF 2.2 * Apply style * Fix training_args_tf for TF 2.2 * Fix the TF training args when only one GPU is available * Remove the fixed version of TF in setup.py
2020-05-11 17:28:37 +02:00
parent cffbb3d8ed
commit 94b57bf796
3 changed files with 14 additions and 6 deletions
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -204,7 +204,10 @@ class GradientAccumulator(object):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
-                tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                tf.constant(0, dtype=tf.int64),
+                trainable=False,
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()
@@ -223,7 +226,10 @@ class GradientAccumulator(object):
            self._gradients.extend(
                [
                    tf.Variable(
-                        tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                        tf.zeros_like(gradient),
+                        trainable=False,
+                        synchronization=tf.VariableSynchronization.ON_READ,
+                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    for gradient in gradients
                ]
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments):
                strategy = tf.distribute.experimental.TPUStrategy(tpu)
            elif len(gpus) == 0:
                strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+            elif len(gpus) == 1:
+                strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
            elif len(gpus) > 1:
                # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-                strategy = tf.distribute.MirroredStrategy(gpus)
+                strategy = tf.distribute.MirroredStrategy()
            else:
                raise ValueError("Cannot find the proper strategy please check your environment properties.")