From 94b57bf796022ad87f1ba8655e19575d98ff4ce6 Mon Sep 17 00:00:00 2001 From: Julien Plu Date: Mon, 11 May 2020 17:28:37 +0200 Subject: [PATCH] [TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283) * Fix the issue to properly run the accumulator with TF 2.2 * Apply style * Fix training_args_tf for TF 2.2 * Fix the TF training args when only one GPU is available * Remove the fixed version of TF in setup.py --- setup.py | 6 +++--- src/transformers/optimization_tf.py | 10 ++++++++-- src/transformers/training_args_tf.py | 4 +++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index f56a153f88..873d643ea6 100644 --- a/setup.py +++ b/setup.py @@ -67,8 +67,8 @@ extras = {} extras["mecab"] = ["mecab-python3"] extras["sklearn"] = ["scikit-learn"] -extras["tf"] = ["tensorflow<=2.1.0"] -extras["tf-cpu"] = ["tensorflow-cpu<=2.1.0"] +extras["tf"] = ["tensorflow"] +extras["tf-cpu"] = ["tensorflow-cpu"] extras["torch"] = ["torch"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] @@ -81,7 +81,7 @@ extras["quality"] = [ "isort @ git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort", "flake8", ] -extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow<=2.1.0", "torch"] +extras["dev"] = extras["testing"] + extras["quality"] + ["mecab-python3", "scikit-learn", "tensorflow", "torch"] setup( name="transformers", diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 70b3feff65..6f4e789089 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -204,7 +204,10 @@ class GradientAccumulator(object): """Number of accumulated steps.""" if self._accum_steps is None: self._accum_steps = tf.Variable( - tf.constant(0, dtype=tf.int64), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, + tf.constant(0, dtype=tf.int64), + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) return self._accum_steps.value() @@ -223,7 +226,10 @@ class GradientAccumulator(object): self._gradients.extend( [ tf.Variable( - tf.zeros_like(gradient), trainable=False, synchronization=tf.VariableSynchronization.ON_READ, + tf.zeros_like(gradient), + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, ) for gradient in gradients ] diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index c903a5579f..bb9bf9d18a 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -56,9 +56,11 @@ class TFTrainingArguments(TrainingArguments): strategy = tf.distribute.experimental.TPUStrategy(tpu) elif len(gpus) == 0: strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") + elif len(gpus) == 1: + strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") elif len(gpus) > 1: # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` - strategy = tf.distribute.MirroredStrategy(gpus) + strategy = tf.distribute.MirroredStrategy() else: raise ValueError("Cannot find the proper strategy please check your environment properties.")