From 94b57bf796022ad87f1ba8655e19575d98ff4ce6 Mon Sep 17 00:00:00 2001
From: Julien Plu <plu.julien@gmail.com>
Date: Mon, 11 May 2020 17:28:37 +0200
Subject: [PATCH] [TF 2.2 compat] use
 tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)

* Fix the gradient accumulator so that it runs properly with TF 2.2

* Apply style

* Fix training_args_tf for TF 2.2

* Fix the TF training args when only one GPU is available

* Remove the pinned TF version (`<=2.1.0`) from setup.py
---
 setup.py                             |  6 +++---
 src/transformers/optimization_tf.py  | 10 ++++++++--
 src/transformers/training_args_tf.py |  4 +++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index f56a153f88..873d643ea6 100644
--- a/setup.py
+++ b/setup.py
@@ -67,8 +67,8 @@ extras = {}
 
 extras["mecab"] = ["mecab-python3"]
 extras["sklearn"] = ["scikit-learn"]
-extras["tf"] = ["tensorflow<=2.1.0"]
-extras["tf-cpu"] = ["tensorflow-cpu<=2.1.0"]
+extras["tf"] = ["tensorflow"]
+extras["tf-cpu"] = ["tensorflow-cpu"]
 extras["torch"] = ["torch"]
 
 extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"]
@@ -81,7 +81,7 @@ extras["quality"] = [
     "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort",
     "flake8",
 ]
-extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow<=2.1.0", "torch"]
+extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"]
 
 setup(
     name="transformers",
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 70b3feff65..6f4e789089 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -204,7 +204,10 @@ class GradientAccumulator(object):
         """Number of accumulated steps."""
         if self._accum_steps is None:
             self._accum_steps = tf.Variable(
-                tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                tf.constant(0, dtype=tf.int64),
+                trainable=False,
+                synchronization=tf.VariableSynchronization.ON_READ,
+                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
             )
 
         return self._accum_steps.value()
@@ -223,7 +226,10 @@ class GradientAccumulator(object):
             self._gradients.extend(
                 [
                     tf.Variable(
-                        tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ,
+                        tf.zeros_like(gradient),
+                        trainable=False,
+                        synchronization=tf.VariableSynchronization.ON_READ,
+                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                     )
                     for gradient in gradients
                 ]
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index c903a5579f..bb9bf9d18a 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments):
                 strategy = tf.distribute.experimental.TPUStrategy(tpu)
             elif len(gpus) == 0:
                 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
+            elif len(gpus) == 1:
+                strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
             elif len(gpus) > 1:
                 # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-                strategy = tf.distribute.MirroredStrategy(gpus)
+                strategy = tf.distribute.MirroredStrategy()
             else:
                 raise ValueError("Cannot find the proper strategy please check your environment properties.")