From e03966e404394e0ccd4f62f2bdb9607b4a69993d Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 25 Apr 2022 20:10:51 +0100 Subject: [PATCH] TF: XLA stable softmax (#16892) Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- .../generation_tf_logits_process.py | 3 +- src/transformers/generation_tf_utils.py | 4 +- .../models/albert/modeling_tf_albert.py | 4 +- .../models/bart/modeling_tf_bart.py | 4 +- .../models/bert/modeling_tf_bert.py | 4 +- .../blenderbot/modeling_tf_blenderbot.py | 4 +- .../modeling_tf_blenderbot_small.py | 4 +- .../models/clip/modeling_tf_clip.py | 4 +- .../models/convbert/modeling_tf_convbert.py | 6 +-- .../models/ctrl/modeling_tf_ctrl.py | 4 +- .../models/deberta/modeling_tf_deberta.py | 4 +- .../deberta_v2/modeling_tf_deberta_v2.py | 4 +- .../distilbert/modeling_tf_distilbert.py | 4 +- .../models/electra/modeling_tf_electra.py | 4 +- .../models/flaubert/modeling_tf_flaubert.py | 4 +- .../models/funnel/modeling_tf_funnel.py | 4 +- .../models/gpt2/modeling_tf_gpt2.py | 4 +- .../models/gptj/modeling_tf_gptj.py | 4 +- .../models/hubert/modeling_tf_hubert.py | 4 +- .../models/layoutlm/modeling_tf_layoutlm.py | 4 +- .../models/led/modeling_tf_led.py | 8 ++-- .../longformer/modeling_tf_longformer.py | 6 +-- .../models/lxmert/modeling_tf_lxmert.py | 4 +- .../models/marian/modeling_tf_marian.py | 4 +- .../models/mbart/modeling_tf_mbart.py | 4 +- .../mobilebert/modeling_tf_mobilebert.py | 4 +- .../models/mpnet/modeling_tf_mpnet.py | 4 +- .../models/openai/modeling_tf_openai.py | 4 +- .../models/pegasus/modeling_tf_pegasus.py | 4 +- .../models/rembert/modeling_tf_rembert.py | 4 +- .../models/roberta/modeling_tf_roberta.py | 4 +- .../models/roformer/modeling_tf_roformer.py | 4 +- .../modeling_tf_speech_to_text.py | 4 +- src/transformers/models/t5/modeling_tf_t5.py | 4 +- .../models/tapas/modeling_tf_tapas.py | 6 +-- .../transfo_xl/modeling_tf_transfo_xl.py | 4 +- .../models/vit/modeling_tf_vit.py | 4 +- .../models/vit_mae/modeling_tf_vit_mae.py | 4 +- .../models/wav2vec2/modeling_tf_wav2vec2.py | 4 +- .../models/xlm/modeling_tf_xlm.py | 4 +- .../models/xlnet/modeling_tf_xlnet.py | 4 +- src/transformers/pipelines/fill_mask.py | 4 +- .../pipelines/image_classification.py | 3 +- .../zero_shot_image_classification.py | 4 +- src/transformers/tf_utils.py | 26 ++++++++++- ...tf_{{cookiecutter.lowercase_modelname}}.py | 8 ++-- tests/gpt2/test_modeling_tf_gpt2.py | 45 ++++++++++--------- tests/t5/test_modeling_tf_t5.py | 43 +++++++++--------- tests/test_modeling_tf_common.py | 36 +++++++++++++++ 49 files changed, 205 insertions(+), 137 deletions(-) diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py index b771211cea..0a5ac83182 100644 --- a/src/transformers/generation_tf_logits_process.py +++ b/src/transformers/generation_tf_logits_process.py @@ -19,6 +19,7 @@ from typing import List import numpy as np import tensorflow as tf +from .tf_utils import stable_softmax from .utils import add_start_docstrings from .utils.logging import get_logger @@ -166,7 +167,7 @@ class TFTopPLogitsWarper(TFLogitsWarper): topk_scores, topk_indices = tf.math.top_k(scores, scores.shape[-1]) mask_scores = tf.fill(scores.shape, self.filter_value) - cumulative_probs = tf.math.cumsum(tf.nn.softmax(topk_scores, axis=-1), axis=-1) + cumulative_probs = tf.math.cumsum(stable_softmax(topk_scores, axis=-1), axis=-1) score_mask = cumulative_probs < self.top_p # Also include the token that is higher than top_p (the first false = shift and insert a True on the left) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index a5043c0ec5..fedc3fdf98 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -34,7 +34,7 @@ from .generation_tf_logits_process import ( TFTopKLogitsWarper, TFTopPLogitsWarper, ) -from .tf_utils import shape_list +from .tf_utils import shape_list, stable_softmax from .utils import ModelOutput, logging @@ -3060,7 +3060,7 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In logits, sorted_indices, axis=-1, batch_dims=1 ) # expects logits to be of dim (batch_size, vocab_size) - cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) + cumulative_probs = tf.math.cumsum(stable_softmax(sorted_logits, axis=-1), axis=-1) # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 93af948e21..753152f7a8 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -44,7 +44,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -259,7 +259,7 @@ class TFAlbertAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 9cf3e04054..80b94bfe80 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -40,7 +40,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -244,7 +244,7 @@ class TFBartAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index b64e203253..1b75d4dc66 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -49,7 +49,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS, @@ -322,7 +322,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 4225f8e14e..7fa910a3eb 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -40,7 +40,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -245,7 +245,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 2d7fe2af61..612755882e 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -245,7 +245,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 366d0a9eb1..5d20962001 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -34,7 +34,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_start_docstrings, @@ -333,7 +333,7 @@ class TFCLIPAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - _attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + _attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index d460087343..d9d76dd4e2 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -42,7 +42,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, add_code_sample_docstrings, @@ -228,7 +228,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) conv_kernel_layer = tf.reshape(conv_kernel_layer, [-1, self.conv_kernel_size, 1]) - conv_kernel_layer = tf.nn.softmax(conv_kernel_layer, axis=1) + conv_kernel_layer = stable_softmax(conv_kernel_layer, axis=1) paddings = tf.constant( [ @@ -270,7 +270,7 @@ class TFConvBertSelfAttention(tf.keras.layers.Layer): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = stable_softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 7727b8bb99..7fadc65cff 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -31,7 +31,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_ctrl import CTRLConfig @@ -79,7 +79,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype) scaled_attention_logits = scaled_attention_logits + attention_mask - attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) + attention_weights = stable_softmax(scaled_attention_logits, axis=-1) # Mask heads if we want to if head_mask is not None: diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 90ec5ca2c8..2b369eef5d 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( get_initializer, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta import DebertaConfig @@ -96,7 +96,7 @@ class TFDebertaXSoftmax(tf.keras.layers.Layer): rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) - output = tf.nn.softmax(output, self.axis) + output = stable_softmax(output, self.axis) output = tf.where(rmask, 0.0, output) return output diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 39cf57a146..5012aacf44 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -38,7 +38,7 @@ from ...modeling_tf_utils import ( get_initializer, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta_v2 import DebertaV2Config @@ -97,7 +97,7 @@ class TFDebertaV2XSoftmax(tf.keras.layers.Layer): rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) - output = tf.nn.softmax(output, self.axis) + output = stable_softmax(output, self.axis) output = tf.where(rmask, 0.0, output) return output diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 07aeee9e1f..737fc1e3c7 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -43,7 +43,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, add_code_sample_docstrings, @@ -194,7 +194,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): mask = tf.cast(mask, dtype=scores.dtype) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 5496396568..6483988a30 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -44,7 +44,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS, @@ -171,7 +171,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index f751c0f225..d4bd3f53fd 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -34,7 +34,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -361,7 +361,7 @@ class TFFlaubertMultiHeadAttention(tf.keras.layers.Layer): # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) mask = tf.cast(mask, dtype=scores.dtype) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index d68b52c7bc..4e4f95d850 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -42,7 +42,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -530,7 +530,7 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): attn_score = attn_score - (INF * (1 - attention_mask[:, None, None])) # attention probability - attn_prob = tf.nn.softmax(attn_score, axis=-1) + attn_prob = stable_softmax(attn_score, axis=-1) attn_prob = self.attention_dropout(attn_prob, training=training) # attention output, shape batch_size x seq_len x n_head x d_head diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index 8a9544463f..3a11f46bdb 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -40,7 +40,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, ModelOutput, @@ -129,7 +129,7 @@ class TFAttention(tf.keras.layers.Layer): attention_mask = tf.cast(attention_mask, dtype=w.dtype) w = w + attention_mask - w = tf.nn.softmax(w, axis=-1) + w = stable_softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 43878b35a6..feaad22eff 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -43,7 +43,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_gptj import GPTJConfig @@ -191,7 +191,7 @@ class TFGPTJAttention(tf.keras.layers.Layer): # Apply the attention mask attn_weights = attn_weights + attention_mask - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) attn_weights = tf.cast(attn_weights, value.dtype) attn_weights = self.attn_dropout(attn_weights) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 4908aeb910..eb79815f1a 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -23,7 +23,7 @@ import tensorflow as tf from ...activations_tf import get_tf_activation from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...tokenization_utils_base import BatchEncoding from ...utils import ( ModelOutput, @@ -826,7 +826,7 @@ class TFHubertAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 86b2fc5a38..b184cb352e 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_layoutlm import LayoutLMConfig @@ -280,7 +280,7 @@ class TFLayoutLMSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index 8381d81afb..a882e32ec4 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -33,7 +33,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -271,7 +271,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer): ), lambda: attn_scores, ) - attn_probs = tf.nn.softmax(attn_scores, axis=-1) + attn_probs = stable_softmax(attn_scores, axis=-1) # softmax sometimes inserts NaN if all positions are masked, replace them with 0 # Make sure to create a mask with the proper shape: @@ -886,7 +886,7 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer): ) # compute global attn probs - global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1) + global_attn_probs_float = stable_softmax(global_attn_scores, axis=-1) # apply layer head masking if layer_head_mask is not None: @@ -1085,7 +1085,7 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer): ) attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: if tf.executing_eagerly(): diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 850a8113f6..124fe2c06f 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -34,7 +34,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -800,7 +800,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ), lambda: attn_scores, ) - attn_probs = tf.nn.softmax(attn_scores, axis=-1) + attn_probs = stable_softmax(attn_scores, axis=-1) # softmax sometimes inserts NaN if all positions are masked, replace them with 0 # Make sure to create a mask with the proper shape: @@ -1415,7 +1415,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer): ) # compute global attn probs - global_attn_probs_float = tf.nn.softmax(global_attn_scores, axis=-1) + global_attn_probs_float = stable_softmax(global_attn_scores, axis=-1) # apply layer head masking if layer_head_mask is not None: diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 2101b7cf1f..08c4dedce5 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -22,6 +22,8 @@ from typing import Dict, Optional, Tuple import tensorflow as tf +from transformers.tf_utils import stable_softmax + from ...activations_tf import get_tf_activation from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, keras_serializable, shape_list, unpack_inputs from ...utils import ( @@ -302,7 +304,7 @@ class TFLxmertAttention(tf.keras.layers.Layer): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = stable_softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index a696a5648f..04a24ac9f9 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -284,7 +284,7 @@ class TFMarianAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 1f427a8d10..b31ac1bd63 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -246,7 +246,7 @@ class TFMBartAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index 5a357418c3..ee3e139c16 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -46,7 +46,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -278,7 +278,7 @@ class TFMobileBertSelfAttention(tf.keras.layers.Layer): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = stable_softmax(attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 0e8c61e340..41432a6fb5 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -42,7 +42,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, add_code_sample_docstrings, @@ -241,7 +241,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): if attention_mask is not None: attention_scores = attention_scores + attention_mask - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = stable_softmax(attention_scores, axis=-1) attention_probs = self.dropout(attention_probs, training=training) diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index 40a94c1881..5215ad7c2f 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -35,7 +35,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -111,7 +111,7 @@ class TFAttention(tf.keras.layers.Layer): attention_mask = tf.cast(attention_mask, dtype=w.dtype) w = w + attention_mask - w = tf.nn.softmax(w, axis=-1) + w = stable_softmax(w, axis=-1) w = self.attn_dropout(w, training=training) # Mask heads if we want to diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index d7eea1660a..be2539b3a9 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -285,7 +285,7 @@ class TFPegasusAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index f40ea6f6f1..c039f26350 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -45,7 +45,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS, @@ -241,7 +241,7 @@ class TFRemBertSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index b63d99a901..7c39b7334a 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -46,7 +46,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, MULTIPLE_CHOICE_DUMMY_INPUTS, @@ -290,7 +290,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 020824bb37..6bad979773 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -46,7 +46,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, add_code_sample_docstrings, @@ -262,7 +262,7 @@ class TFRoFormerSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 75b237dc7a..8980636c3b 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -36,7 +36,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -348,7 +348,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index d7fd5b3014..2e48174a90 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -41,7 +41,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -398,7 +398,7 @@ class TFT5Attention(tf.keras.layers.Layer): position_bias = position_bias + mask # (batch_size, n_heads, query_length, key_length) scores += position_bias - weights = tf.nn.softmax(scores, axis=-1) # (batch_size, n_heads, query_length, key_length) + weights = stable_softmax(scores, axis=-1) # (batch_size, n_heads, query_length, key_length) weights = self.dropout(weights, training=training) # (batch_size, n_heads, query_length, key_length) # Mask heads if we want to diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index b6a2f10d12..e91baaab8e 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -38,7 +38,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_start_docstrings, @@ -346,7 +346,7 @@ class TFTapasSelfAttention(tf.keras.layers.Layer): attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -2216,7 +2216,7 @@ def _calculate_expected_result( aggregation_op_only_probs = gumbel_dist.sample() else: # [batch_size, num_aggregation_labels - 1] - aggregation_op_only_probs = tf.nn.softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1) + aggregation_op_only_probs = stable_softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1) all_results = tf.concat( [ tf.expand_dims(sum_result, axis=1), diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index d1f7ad01dd..2975373883 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -31,7 +31,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -236,7 +236,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): attn_score = attn_score * (1.0 - attn_mask_t) - 1e30 * attn_mask_t # [qlen x klen x bsz x n_head] - attn_prob = tf.nn.softmax(attn_score, axis=1) + attn_prob = stable_softmax(attn_score, axis=1) attn_prob = self.dropatt(attn_prob, training=training) # Mask heads if we want to diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index e3c039ca83..9d478e968c 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -32,7 +32,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_vit import ViTConfig @@ -260,7 +260,7 @@ class TFViTSelfAttention(tf.keras.layers.Layer): attention_scores = tf.divide(attention_scores, dk) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py index 40642ef4a6..f464b6665a 100644 --- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -38,7 +38,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_vit_mae import ViTMAEConfig @@ -407,7 +407,7 @@ class TFViTMAESelfAttention(tf.keras.layers.Layer): attention_scores = tf.divide(attention_scores, dk) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 98f922c3d4..9bbb908eb0 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -25,7 +25,7 @@ import tensorflow as tf from ...activations_tf import get_tf_activation from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...tokenization_utils_base import BatchEncoding from ...utils import ( ModelOutput, @@ -855,7 +855,7 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 6919b10ef9..24d32f798f 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -44,7 +44,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -187,7 +187,7 @@ class TFXLMMultiHeadAttention(tf.keras.layers.Layer): # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) mask = tf.cast(mask, dtype=scores.dtype) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = stable_softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index d81924d345..f5a1cba3c8 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -39,7 +39,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, ModelOutput, @@ -159,7 +159,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_score = attn_score - 1e30 * attn_mask # attention probability - attn_prob = tf.nn.softmax(attn_score, axis=1) + attn_prob = stable_softmax(attn_score, axis=1) attn_prob = self.dropout(attn_prob, training=training) diff --git a/src/transformers/pipelines/fill_mask.py b/src/transformers/pipelines/fill_mask.py index 4a0c155576..517b457a65 100644 --- a/src/transformers/pipelines/fill_mask.py +++ b/src/transformers/pipelines/fill_mask.py @@ -9,6 +9,8 @@ from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline, PipelineException if is_tf_available(): import tensorflow as tf + from ..tf_utils import stable_softmax + if is_torch_available(): import torch @@ -101,7 +103,7 @@ class FillMaskPipeline(Pipeline): outputs = outputs.numpy() logits = outputs[0, masked_index, :] - probs = tf.nn.softmax(logits, axis=-1) + probs = stable_softmax(logits, axis=-1) if target_ids is not None: probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1)) probs = tf.expand_dims(probs, 0) diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index d39a188168..e180aaf8cc 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -20,6 +20,7 @@ if is_tf_available(): import tensorflow as tf from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING + from ..tf_utils import stable_softmax if is_torch_available(): from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING @@ -103,7 +104,7 @@ class ImageClassificationPipeline(Pipeline): probs = model_outputs.logits.softmax(-1)[0] scores, ids = probs.topk(top_k) elif self.framework == "tf": - probs = tf.nn.softmax(model_outputs.logits, axis=-1)[0] + probs = stable_softmax(model_outputs.logits, axis=-1)[0] topk = tf.math.top_k(probs, k=top_k) scores, ids = topk.values.numpy(), topk.indices.numpy() else: diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 951f11ef0d..0256f00a3f 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -22,6 +22,8 @@ if is_torch_available(): if is_tf_available(): import tensorflow as tf + from ..tf_utils import stable_softmax + logger = logging.get_logger(__name__) @@ -119,7 +121,7 @@ class ZeroShotImageClassificationPipeline(ChunkPipeline): scores = probs.tolist() else: logits = tf.concat([output["logits_per_image"] for output in model_outputs], axis=0) - probs = tf.nn.softmax(logits, axis=0) + probs = stable_softmax(logits, axis=0) scores = probs.numpy().tolist() result = [ diff --git a/src/transformers/tf_utils.py b/src/transformers/tf_utils.py index c0d076b31c..ce43a4537a 100644 --- a/src/transformers/tf_utils.py +++ b/src/transformers/tf_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Union +from typing import List, Optional, Union import numpy as np import tensorflow as tf @@ -44,3 +44,27 @@ def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]: static = tensor.shape.as_list() return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor: + """ + Stable wrapper that returns the same output as `tf.nn.softmax`, but that works reliably with XLA on CPU. It is + meant as a workaround for the [following issue](https://github.com/tensorflow/tensorflow/issues/55682), and will be + removed after it gets fixed. The arguments and outputs are the same as `tf.nn.softmax`, and relies on the fact that + `softmax(x) = softmax(x + c)` (see https://ogunlao.github.io/2020/04/26/you_dont_really_know_softmax.html). + + Args: + logits (`tf.Tensor`): + Must be one of the following types: half, float32, float64. + axis (`int`, *optional*): + The dimension softmax would be performed on. The default is -1 which indicates the last dimension. + name (`str`, *optional*): + A name for the operation. + + Returns: + `tf.Tensor`: + A Tensor. Has the same type and shape as logits. + """ + # TODO: When the issue linked above gets sorted, add a check on TF version here and use the original function if + # it has the fix. After we drop the support for unfixed versions, remove this function. + return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 2d9914eebd..f5c40b27d6 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -53,7 +53,7 @@ from ...modeling_tf_utils import ( keras_serializable, unpack_inputs, ) -from ...tf_utils import shape_list +from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config @@ -244,7 +244,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) attention_scores = tf.add(attention_scores, attention_mask) # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + attention_probs = stable_softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -1665,8 +1665,8 @@ from ...modeling_tf_utils import ( TFWrappedEmbeddings, keras_serializable, unpack_inputs, -); from ...tf_utils import (shape_list, ) +from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config @@ -1855,7 +1855,7 @@ class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) - attn_weights = tf.nn.softmax(attn_weights, axis=-1) + attn_weights = stable_softmax(attn_weights, axis=-1) if layer_head_mask is not None: # The tf.debugging asserts are not compliant with XLA then they diff --git a/tests/gpt2/test_modeling_tf_gpt2.py b/tests/gpt2/test_modeling_tf_gpt2.py index 4f9b98d685..2092fb8feb 100644 --- a/tests/gpt2/test_modeling_tf_gpt2.py +++ b/tests/gpt2/test_modeling_tf_gpt2.py @@ -16,7 +16,7 @@ import unittest from transformers import GPT2Config, is_tf_available -from transformers.testing_utils import get_gpu_count, require_tf, slow +from transformers.testing_utils import require_tf, slow from ..test_configuration_common import ConfigTester from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -536,8 +536,6 @@ class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): self.assertListEqual(output_strings, expected_output_string) @slow - @unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU") - # TODO: remove the skip when the XLA CPU softmax issue gets sorted def test_lm_generate_gpt2_greedy_xla(self): # TODO (Joao): convert this to an example with a batch size>1 with different input lengths that works (and fix # the underlying problem) @@ -563,30 +561,33 @@ class TFGPT2ModelLanguageGenerationTest(unittest.TestCase): self.assertListEqual(output_strings, expected_output_strings) @slow - @unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU") - # TODO: remove the skip when the XLA CPU softmax issue gets sorted def test_lm_generate_gpt2_sample_xla(self): # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible # and that we can seed both versions. - model = TFGPT2LMHeadModel.from_pretrained("gpt2") - tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - tokenizer.pad_token = tokenizer.eos_token - tokenizer.padding_side = "left" + # forces the generation to happen on CPU, to avoid GPU-related quirks + with tf.device(":/CPU:0"): + model = TFGPT2LMHeadModel.from_pretrained("gpt2") + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - sentence = ["The dog"] - expected_output_string = [ - "The dog must be well educated to do anything. If anything, this must be her best friend" - ] - expected_output_string_xla = ["The dog has been named in connection with the murder of a 20-year-old man in!"] - input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" - output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0]) - output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - self.assertListEqual(output_strings, expected_output_string) + sentence = ["The dog"] + expected_output_string = [ + "The dog owner asked why did our vet decide there needed to be extra ventilation inside because most puppies" + ] + expected_output_string_xla = [ + "The dog has been named in connection with the murder of a 20-year-old man in!" + ] + input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids - xla_generate = tf.function(model.generate, jit_compile=True) - output_ids = xla_generate(input_ids, do_sample=True, seed=[7, 0]) - output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - self.assertListEqual(output_strings, expected_output_string_xla) + output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0]) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_string) + + xla_generate = tf.function(model.generate, jit_compile=True) + output_ids = xla_generate(input_ids, do_sample=True, seed=[7, 0]) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(output_strings, expected_output_string_xla) diff --git a/tests/t5/test_modeling_tf_t5.py b/tests/t5/test_modeling_tf_t5.py index c84339d5d2..d3008f017d 100644 --- a/tests/t5/test_modeling_tf_t5.py +++ b/tests/t5/test_modeling_tf_t5.py @@ -16,7 +16,7 @@ import unittest from transformers import T5Config, is_tf_available -from transformers.testing_utils import get_gpu_count, require_sentencepiece, require_tf, require_tokenizers, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow from transformers.utils import cached_property from ..test_configuration_common import ConfigTester @@ -481,8 +481,6 @@ class TFT5EncoderOnlyModelTest(TFModelTesterMixin, unittest.TestCase): @require_tokenizers class TFT5GenerationIntegrationTests(unittest.TestCase): @slow - @unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU") - # TODO: remove the skip when the XLA CPU softmax issue gets sorted def test_greedy_xla_generate_simple(self): model = TFT5ForConditionalGeneration.from_pretrained("t5-small") tokenizer = T5Tokenizer.from_pretrained("t5-small") @@ -534,30 +532,31 @@ class TFT5GenerationIntegrationTests(unittest.TestCase): self.assertListEqual(expected_output_string, output_strings) @slow - @unittest.skipIf(not get_gpu_count(), "XLA not reliable on CPU") - # TODO: remove the skip when the XLA CPU softmax issue gets sorted def test_sample_xla_generate_simple(self): # NOTE: due to the small numerical differences that are natural when we compile to XLA, sampling the same - # output out of the same seed is far from guaranteed (unlike this example). We can, however, confirm that the - # results are sensible and that we can seed both versions. - model = TFT5ForConditionalGeneration.from_pretrained("t5-small") - tokenizer = T5Tokenizer.from_pretrained("t5-small") + # output out of the same seed is far from guaranteed. We can, however, confirm that the results are sensible + # and that we can seed both versions. - sentence = "Translate English to German: I have two bananas" - input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids - expected_output_string = ["Ich habe 2 Bananen"] - expected_output_string_xla = ["Ich habe 2 Bananen"] + # forces the generation to happen on CPU, to avoid GPU-related quirks + with tf.device(":/CPU:0"): + model = TFT5ForConditionalGeneration.from_pretrained("t5-small") + tokenizer = T5Tokenizer.from_pretrained("t5-small") - # seed set -> deterministic sampling sequence -> deterministic generation - output_ids = model.generate(input_ids, do_sample=True, seed=[42, 0]) - output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - self.assertListEqual(expected_output_string, output_strings) + sentence = "Translate English to German: I have two bananas" + input_ids = tokenizer(sentence, return_tensors="tf", padding=True).input_ids + expected_output_string = ["Ich habe zwei Bananen"] + expected_output_string_xla = ["Ich habe 2 Bananen"] - xla_generate = tf.function(model.generate, jit_compile=True) - # seed set -> deterministic sampling sequence -> deterministic generation - output_ids_xla = xla_generate(input_ids, do_sample=True, seed=[42, 0]) - output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True) - self.assertListEqual(expected_output_string_xla, output_strings_xla) + # seed set -> deterministic sampling sequence -> deterministic generation + output_ids = model.generate(input_ids, do_sample=True, seed=[42, 0]) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertListEqual(expected_output_string, output_strings) + + xla_generate = tf.function(model.generate, jit_compile=True) + # seed set -> deterministic sampling sequence -> deterministic generation + output_ids_xla = xla_generate(input_ids, do_sample=True, seed=[42, 0]) + output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True) + self.assertListEqual(expected_output_string_xla, output_strings_xla) @slow def test_sample_generate(self): diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 6fb0f845c6..0d38713e08 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -84,6 +84,7 @@ if is_tf_available(): TFSampleEncoderDecoderOutput, ) from transformers.modeling_tf_utils import unpack_inputs + from transformers.tf_utils import stable_softmax if _tf_gpu_memory_limit is not None: gpus = tf.config.list_physical_devices("GPU") @@ -1709,6 +1710,41 @@ class UtilsFunctionsTest(unittest.TestCase): self.assertFalse(output[3]) self.assertFalse(output[4]) + # Tests whether the stable softmax is stable on CPU, with and without XLA + def test_xla_stable_softmax(self): + large_penalty = -1e9 + n_tokens = 10 + batch_size = 8 + + def masked_softmax(x, boolean_mask): + numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty + masked_x = x + numerical_mask + return stable_softmax(masked_x) + + xla_masked_softmax = tf.function(masked_softmax, jit_compile=True) + xla_stable_softmax = tf.function(stable_softmax, jit_compile=True) + x = tf.random.normal((batch_size, n_tokens)) + + # Same outcome regardless of the boolean mask here + masked_tokens = random.randint(0, n_tokens) + boolean_mask = tf.convert_to_tensor([[1] * (n_tokens - masked_tokens) + [0] * masked_tokens], dtype=tf.int32) + + # We can randomly mask a random numerical input OUTSIDE XLA + numerical_mask = (1.0 - tf.cast(boolean_mask, dtype=tf.float32)) * large_penalty + masked_x = x + numerical_mask + xla_out = xla_stable_softmax(masked_x) + out = stable_softmax(masked_x) + assert tf.experimental.numpy.allclose(xla_out, out) + + # The stable softmax has the same output as the original softmax + unstable_out = tf.nn.softmax(masked_x) + assert tf.experimental.numpy.allclose(unstable_out, out) + + # We can randomly mask a random numerical input INSIDE XLA + xla_out = xla_masked_softmax(x, boolean_mask) + out = masked_softmax(x, boolean_mask) + assert tf.experimental.numpy.allclose(xla_out, out) + @require_tf @is_staging_test