TF: XLA beam search + most generation-compatible models are now also XLA-generate-compatible (#17857)

* working beam search 🎉 * XLA generation compatible with ALL classes * add xla generation slow test
2022-06-29 12:41:01 +01:00
parent b8142753f9
commit e6d27ca5c8
16 changed files with 356 additions and 301 deletions
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 from .generation_tf_logits_process import (
    TFForcedBOSTokenLogitsProcessor,
@@ -346,6 +347,7 @@ class TFGenerationMixin:
    """
    seed_generator = tf.random.Generator.from_non_deterministic_state()
    supports_xla_generation = True
    def prepare_inputs_for_generation(self, inputs, **kwargs):
        """
@@ -1511,6 +1513,12 @@ class TFGenerationMixin:
                f"length ({max_length})"
            )
        use_xla = not tf.executing_eagerly()
        if use_xla and not self.supports_xla_generation:
            raise ValueError(
                "The selected model does not support Graph mode nor XLA generation (e.g. from tf.function())"
            )
        # 2. Define model inputs
        input_ids = self._prepare_model_inputs(input_ids, bos_token_id)
        # inputs_ids now has to be defined and cannot be None anymore
@@ -1807,12 +1815,135 @@ class TFGenerationMixin:
        return model_kwargs
    def _update_model_kwargs_for_xla_generation(
-        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], current_pos: tf.Tensor, max_length: int
+        self,
-    ) -> Dict[str, Any]:
+        model_outputs: ModelOutput,
-        raise NotImplementedError(
+        model_kwargs: Dict[str, Any],
-            f"{self.__class__} is not compileable with XLA at the moment. You should implement a "
+        cur_len: int,
-            "`_update_model_kwargs_for_xla_generation` in the respective modeling file for XLA-compatible generation."
+        max_length: int,
        batch_size: int,
        is_encoder_decoder: bool = False,
        batch_axis: int = 0,
    ):
        def _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder):
            """initializes the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`"""
            if is_encoder_decoder:
                # One 1 for decoder_start_token_id, 0s for the currently-unfilled locations in the past tensor,
                # 1s for the actual input_ids
                decoder_attention_mask = tf.concat(
                    [
                        tf.ones((batch_size, 1), dtype=tf.int32),
                        tf.zeros((batch_size, num_padding_values), dtype=tf.int32),
                        tf.ones((batch_size, 1), dtype=tf.int32),
                    ],
                    axis=1,
                )
                mask = {"decoder_attention_mask": decoder_attention_mask}
            else:
                attention_mask = model_kwargs.pop("attention_mask")
                # 0s for the currently-unfilled locations in the past tensor, 1s for the actual input_ids
                attention_mask = tf.concat(
                    [
                        attention_mask,
                        tf.zeros((batch_size, num_padding_values), dtype=attention_mask.dtype),
                        tf.ones((batch_size, 1), dtype=attention_mask.dtype),
                    ],
                    axis=1,
                )
                mask = {"attention_mask": attention_mask}
            return mask
        def _update_attention(model_kwargs, new_past_index, is_encoder_decoder):
            """updates the appropriate attention mask -- encoder-decoder models use `decoder_attention_mask`"""
            update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
            if is_encoder_decoder:
                decoder_attention_mask = model_kwargs.pop("decoder_attention_mask")
                decoder_attention_mask_update_slice = tf.ones((batch_size, 1), dtype=decoder_attention_mask.dtype)
                decoder_attention_mask = dynamic_update_slice(
                    decoder_attention_mask, decoder_attention_mask_update_slice, update_start
                )
                mask = {"decoder_attention_mask": decoder_attention_mask}
            else:
                attention_mask = model_kwargs.pop("attention_mask")
                attention_mask_update_slice = tf.ones((batch_size, 1), dtype=attention_mask.dtype)
                attention_mask = dynamic_update_slice(attention_mask, attention_mask_update_slice, update_start)
                mask = {"attention_mask": attention_mask}
            return mask
        def _initialize_past(past, num_padding_values, batch_axis):
            """initialize past with zeros -- the structure depends on `batch_axis`"""
            if batch_axis == 0:
                padding_values = tf.scatter_nd(indices=[[2, 1]], updates=[num_padding_values], shape=(4, 2))
                new_past = ()
                for past_layer in past:
                    new_past_layer = list(past_layer)
                    for i in range(len(new_past_layer[:2])):
                        new_past_layer[i] = tf.pad(past_layer[i], padding_values)
                    new_past += (tuple(new_past_layer),)
            else:
                padding_values = tf.scatter_nd(indices=[[3, 1]], updates=[num_padding_values], shape=(5, 2))
                new_past = list(past)
                for i in range(len(past)):
                    new_past[i] = tf.pad(past[i], padding_values)
            return new_past
        def _update_past(past, new_past_index, batch_axis):
            if batch_axis == 0:
                slice_start_base = tf.constant([0, 0, 1, 0])
                new_past = ()
                for past_layer in past:
                    new_past_layer = list(past_layer)
                    for i in range(len(new_past_layer[:2])):
                        update_slice = past_layer[i][:, :, -1:]
                        # Write the last slice to the first open location in the padded past array
                        # and then truncate the last slice off the array
                        new_past_layer[i] = dynamic_update_slice(
                            past_layer[i][:, :, :-1], update_slice, slice_start_base * new_past_index
                        )
                    new_past += (tuple(new_past_layer),)
            else:
                slice_start_base = tf.constant([0, 0, 0, 1, 0])
                new_past = [None for _ in range(len(past))]
                for i in range(len(past)):
                    update_slice = past[i][:, :, :, -1:]
                    # Write the last slice to the first open location in the padded past array
                    # and then truncate the last slice off the array
                    new_past[i] = dynamic_update_slice(
                        past[i][:, :, :, :-1], update_slice, slice_start_base * new_past_index
                    )
            return new_past
        if "past_key_values" in model_outputs:
            past = model_outputs.past_key_values
        elif "mems" in model_outputs:
            past = model_outputs.mems
        elif "past_buckets_states" in model_outputs:
            past = model_outputs.past_buckets_states
        else:
            raise ValueError(
                f"No known past variable found in model outputs (model outputs keys: {list(model_outputs.keys())})"
            )
        is_past_initialized = model_kwargs.pop("past", None) is not None
        if not is_past_initialized:
            # The padded version of `past` has a length of `max_length - 1`, as `past` holds information relative to
            # previous autoregressive generation steps (step 0 has no past, step 1 has 1 past value, ..., the last step
            # has `max_length - 1` past values).
            num_padding_values = max_length - cur_len - 1
            mask = _initialize_attention(model_kwargs, num_padding_values, is_encoder_decoder)
            new_past = _initialize_past(past, num_padding_values, batch_axis)
        else:
            # The new index of past to be filled corresponds to the current length of the sequence, with two
            # subtractions: -1 because past holds information regarding previous generation steps (read comment above)
            # and -1 again because in an array the index is the length of the array minus 1.
            new_past_index = cur_len - 2
            mask = _update_attention(model_kwargs, new_past_index, is_encoder_decoder)
            new_past = _update_past(past, new_past_index, batch_axis)
        # sets the updated variables (mask and past)
        model_kwargs.update(mask)
        model_kwargs["past"] = tuple(new_past)
        return model_kwargs
    def _get_logits_warper(
        self,
@@ -1978,6 +2109,10 @@ class TFGenerationMixin:
            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
        )
        use_xla = not tf.executing_eagerly()
        # TODO (Joao): fix cache format or find programatic way to detect cache index
        # GPT2 and other models has a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any([model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")]) else 0
        # some models, like XLNet, need more than the last token in the presence of past
        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())
@@ -2010,29 +2145,29 @@ class TFGenerationMixin:
                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token logits
-            outputs = self(
+            model_outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
-            next_token_logits = outputs.logits[:, -1]
+            next_token_logits = model_outputs.logits[:, -1]
            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    scores.append(next_token_logits)
                if output_attentions and self.config.is_encoder_decoder:
-                    decoder_attentions.append(outputs.decoder_attentions)
+                    decoder_attentions.append(model_outputs.decoder_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
-                    decoder_attentions.append(outputs.attentions)
+                    decoder_attentions.append(model_outputs.attentions)
                    if self.config.is_encoder_decoder:
-                        cross_attentions.append(outputs.cross_attentions)
+                        cross_attentions.append(model_outputs.cross_attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
-                    decoder_hidden_states.append(outputs.decoder_hidden_states)
+                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
                elif output_hidden_states and self.config.is_encoder_decoder:
-                    decoder_hidden_states.append(outputs.hidden_states)
+                    decoder_hidden_states.append(model_outputs.hidden_states)
            # pre-process distribution
            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)
@@ -2054,10 +2189,18 @@ class TFGenerationMixin:
            # update model_kwargs
            if use_xla:
-                model_kwargs = self._update_model_kwargs_for_xla_generation(outputs, model_kwargs, cur_len, max_length)
+                model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=batch_size,
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                model_kwargs = self._update_model_kwargs_for_generation(
-                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )
                # if we don't cache past key values we need the whole input
                if model_kwargs.get("past", None) is None:
@@ -2236,6 +2379,10 @@ class TFGenerationMixin:
            return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
        )
        use_xla = not tf.executing_eagerly()
        # TODO (Joao): fix cache format or find programatic way to detect cache index
        # GPT2 and other models has a slightly different cache structure, with a different batch axis
        model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
        cache_batch_axis = 1 if any([model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")]) else 0
        # some models, like XLNet, need more than the last token in the presence of past
        needs_full_input = "use_mems" in set(inspect.signature(self.prepare_inputs_for_generation).parameters.keys())
@@ -2264,29 +2411,29 @@ class TFGenerationMixin:
                input_ids = tf.expand_dims(generated[:, cur_len - 1], -1)
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            # forward pass to get next token logits
-            outputs = self(
+            model_outputs = self(
                **model_inputs,
                return_dict=True,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
-            next_token_logits = outputs.logits[:, -1]
+            next_token_logits = model_outputs.logits[:, -1]
            # Store scores, attentions and hidden_states when required
            if not use_xla and return_dict_in_generate:
                if output_scores:
                    scores.append(next_token_logits)
                if output_attentions and self.config.is_encoder_decoder:
-                    decoder_attentions.append(outputs.decoder_attentions)
+                    decoder_attentions.append(model_outputs.decoder_attentions)
                elif output_attentions and not self.config.is_encoder_decoder:
-                    decoder_attentions.append(outputs.attentions)
+                    decoder_attentions.append(model_outputs.attentions)
                    if self.config.is_encoder_decoder:
-                        cross_attentions.append(outputs.cross_attentions)
+                        cross_attentions.append(model_outputs.cross_attentions)
                if output_hidden_states and self.config.is_encoder_decoder:
-                    decoder_hidden_states.append(outputs.decoder_hidden_states)
+                    decoder_hidden_states.append(model_outputs.decoder_hidden_states)
                elif output_hidden_states and self.config.is_encoder_decoder:
-                    decoder_hidden_states.append(outputs.hidden_states)
+                    decoder_hidden_states.append(model_outputs.hidden_states)
            # pre-process distribution
            next_tokens_scores = logits_processor(generated, next_token_logits, cur_len)
@@ -2318,10 +2465,18 @@ class TFGenerationMixin:
            # update model_kwargs
            if use_xla:
-                model_kwargs = self._update_model_kwargs_for_xla_generation(outputs, model_kwargs, cur_len, max_length)
+                model_kwargs = self._update_model_kwargs_for_xla_generation(
                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=batch_size,
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                model_kwargs = self._update_model_kwargs_for_generation(
-                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+                    model_outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )
                # if we don't cache past key values we need the whole input
                if model_kwargs.get("past", None) is None:
@@ -2484,9 +2639,6 @@ class TFGenerationMixin:
        def flatten_beam_dim(tensor, batch_axis=0):
            """Flattens the first two dimensions of a non-scalar array."""
            # ignore scalars (e.g. cache index)
            if tf.rank(tensor) == 0:
                return tensor
            return tf.reshape(
                tensor,
                tensor.shape[:batch_axis]
@@ -2496,9 +2648,6 @@ class TFGenerationMixin:
        def unflatten_beam_dim(tensor, batch_size, num_beams, batch_axis=0):
            """Unflattens the first, flat batch*beam dimension of a non-scalar array."""
            # ignore scalars (e.g. cache index)
            if tf.rank(tensor) == 0:
                return tensor
            return tf.reshape(
                tensor, tensor.shape[:batch_axis] + [batch_size, num_beams] + tensor.shape[batch_axis + 1 :]
            )
@@ -2507,23 +2656,15 @@ class TFGenerationMixin:
            """Gathers the beam slices indexed by beam_indices into new beam array."""
            def gather_fn(tensor):
                # ignore scalars (e.g. cache index)
                if tf.rank(tensor) == 0:
                    return tensor
                else:
                if batch_axis > 0:
                    # pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...)
-                        perm = [axis for axis in range(tf.rank(tensor)) if axis >= batch_axis] + list(
+                    perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
                            range(batch_axis)
                        )
                    tensor = tf.transpose(tensor, perm=perm)
                gathered_tensor = tf.gather(params=tensor, indices=beam_indices, axis=1, batch_dims=1)
                if batch_axis > 0:
                    # transposes back to the original dimensions
-                        perm = [axis for axis in range(tf.rank(tensor)) if axis >= batch_axis] + list(
+                    perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
                            range(batch_axis)
                        )
                    perm = tf.math.invert_permutation(perm)
                    gathered_tensor = tf.transpose(gathered_tensor, perm=perm)
@@ -2734,7 +2875,7 @@ class TFGenerationMixin:
            # - add length penalty
            # - make sure no scores can be added anymore if beam is full
            # - make sure still running sequences cannot be chosen as finalized beam
-            topk_log_probs = topk_log_probs / (cur_len**length_penalty)
+            topk_log_probs = topk_log_probs / (tf.cast(cur_len, dtype=tf.float32) ** length_penalty)
            beams_in_batch_are_full = (
                tf.broadcast_to(
                    tf.math.reduce_all(is_sent_finished, axis=-1, keepdims=True), did_topk_just_finished.shape
@@ -2772,7 +2913,13 @@ class TFGenerationMixin:
            if use_xla:
                next_model_kwargs = self._update_model_kwargs_for_xla_generation(
-                    model_outputs, model_kwargs, cur_len, max_length
+                    model_outputs=model_outputs,
                    model_kwargs=model_kwargs,
                    cur_len=cur_len,
                    max_length=max_length,
                    batch_size=(batch_size * num_beams),
                    is_encoder_decoder=self.config.is_encoder_decoder,
                    batch_axis=cache_batch_axis,
                )
            else:
                next_model_kwargs = self._update_model_kwargs_for_generation(
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -20,7 +20,6 @@ from typing import Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 from ...activations_tf import get_tf_activation
 from ...modeling_tf_outputs import (
@@ -1434,69 +1433,6 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageMode
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }
    def _update_model_kwargs_for_xla_generation(self, outputs, model_kwargs, current_pos, max_length):
        # TODO(Pvp, Joao, Matt) - this function can be cleaned a bit and refactored
        # quite some duplicated code patterns it seems
        past = outputs.past_key_values
        is_past_initialized = model_kwargs.pop("past", None) is not None
        decoder_attention_mask = model_kwargs.pop("decoder_attention_mask", None)
        batch_size = past[0][0].shape[0]
        if not is_past_initialized:
            # past[0][0].shape[2] is seq_length of prompt
            # The padded version of `past` requires only `max_length - 1` steps along the time dimension.
            num_padding_values = max_length - past[0][0].shape[2] - 1
            # prepare the padding tensor for `tf.pad`.
            # `shape=(4, 2)` because each tensor element in `past` has `rank=4`.
            # `indices=[[2, 1]]` means the time dimension (dim 2) needs **right**-padding (`1` means padding afterward).
            padding_values = tf.scatter_nd(indices=[[2, 1]], updates=[num_padding_values], shape=(4, 2))
            new_past = ()
            for past_layer in past:
                new_past_layer = list(past_layer)
                for i in range(len(new_past_layer[:2])):
                    new_past_layer[i] = tf.pad(past_layer[i], padding_values)
                new_past += (tuple(new_past_layer),)
            # 1 one for decoder_start_token_id, Zeros for the currently-unfilled locations in the past tensor,
            # ones for the actual input_ids
            decoder_attention_mask = tf.concat(
                [
                    tf.ones((batch_size, 1), dtype=tf.int32),
                    tf.zeros((batch_size, num_padding_values), dtype=tf.int32),
                    tf.ones((batch_size, 1), dtype=tf.int32),
                ],
                axis=1,
            )
        else:
            slice_start_base = tf.constant([0, 0, 1, 0])
            decoder_attention_mask_update_slice = tf.ones((batch_size, 1), dtype=decoder_attention_mask.dtype)
            # correct 5 here
            new_past_index = current_pos - 1
            new_past = ()
            for past_layer in past:
                new_past_layer = list(past_layer)
                for i in range(len(new_past_layer[:2])):
                    update_slice = past_layer[i][:, :, -1:]
                    # Write the last slice to the first open location in the padded past array
                    # and then truncate the last slice off the array
                    new_past_layer[i] = dynamic_update_slice(
                        past_layer[i][:, :, :-1], update_slice, slice_start_base * new_past_index
                    )
                new_past += (tuple(new_past_layer),)
            update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
            decoder_attention_mask = dynamic_update_slice(
                decoder_attention_mask, decoder_attention_mask_update_slice, update_start
            )
        # set `decoder_attention_mask` and `past`
        model_kwargs["decoder_attention_mask"] = decoder_attention_mask
        model_kwargs["past"] = new_past
        return model_kwargs
    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -571,6 +571,8 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        # CTRL has numerical issues in XLA generate
        self.supports_xla_generation = False
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
@@ -613,6 +615,8 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
        self.transformer = TFCTRLMainLayer(config, name="transformer")
        self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
        # CTRL has numerical issues in XLA generate
        self.supports_xla_generation = False
    def get_lm_head(self):
        return self.lm_head
--- a/src/transformers/models/flaubert/modeling_tf_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -761,6 +761,8 @@ class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFFlaubertMainLayer(config, name="transformer")
        self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
        # Flaubert does not have past caching features
        self.supports_xla_generation = False
    def get_lm_head(self):
        return self.pred_layer
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -20,7 +20,6 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 from ...activations_tf import get_tf_activation
 from ...modeling_tf_outputs import (
@@ -838,63 +837,6 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
            "token_type_ids": token_type_ids,
        }
    def _update_model_kwargs_for_xla_generation(self, outputs, model_kwargs, current_pos, max_length):
        # TODO(Pvp, Joao, Matt) - this function can be cleaned a bit and refactored
        # quite some duplicated code patterns it seems
        # also the `attention_mask` is currently used in a somewhat hacky to
        # correctly influence the `past_key_values` - not sure if this is the way to go
        # Let's keep that for a future PR.
        past = outputs.past_key_values
        is_past_initialized = model_kwargs.pop("past", None) is not None
        attention_mask = model_kwargs.pop("attention_mask")
        batch_size = attention_mask.shape[0]
        if not is_past_initialized:
            # past[0].shape[3] is seq_length of prompt
            num_padding_values = max_length - past[0].shape[3] - 1
            padding_values = np.zeros((5, 2), dtype=np.int32)
            padding_values[3, 1] = num_padding_values
            padding_values = tf.constant(padding_values)
            new_past = list(past)
            for i in range(len(past)):
                new_past[i] = tf.pad(past[i], padding_values)
            # Zeros for the currently-unfilled locations in the past tensor, ones for the actual input_ids
            attention_mask = tf.concat(
                [
                    attention_mask,
                    tf.zeros((batch_size, num_padding_values), dtype=attention_mask.dtype),
                    tf.ones((batch_size, 1), dtype=attention_mask.dtype),
                ],
                axis=1,
            )
        else:
            new_past = [None for _ in range(len(past))]
            slice_start_base = tf.constant([0, 0, 0, 1, 0])
            attention_mask_update_slice = tf.ones((batch_size, 1), dtype=attention_mask.dtype)
            # -1 because current_pos has already been incremented before this function
            # -1 again because last index = len - 1
            new_past_index = current_pos - 2
            for i in range(len(past)):
                update_slice = past[i][:, :, :, -1:]
                # Write the last slice to the first open location in the padded past array
                # and then truncate the last slice off the array
                new_past[i] = dynamic_update_slice(
                    past[i][:, :, :, :-1], update_slice, slice_start_base * new_past_index
                )
            update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
            attention_mask = dynamic_update_slice(attention_mask, attention_mask_update_slice, update_start)
        # set `attention_mask` and `past`
        model_kwargs["attention_mask"] = attention_mask
        model_kwargs["past"] = tuple(new_past)
        return model_kwargs
    @unpack_inputs
    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
--- a/src/transformers/models/gptj/modeling_tf_gptj.py
+++ b/src/transformers/models/gptj/modeling_tf_gptj.py
@@ -722,6 +722,8 @@ class TFGPTJForCausalLM(TFGPTJPreTrainedModel, TFCausalLanguageModelingLoss):
        self.lm_head = tf.keras.layers.Dense(
            config.vocab_size, kernel_initializer=get_initializer(config.initializer_range), name="lm_head"
        )
        # TODO (Joao): investigate why GPTJ has numerical issues in XLA generate
        self.supports_xla_generation = False
    def get_output_embeddings(self):
        return self.lm_head
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -2334,6 +2334,8 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
        self.final_logits_bias = self.add_weight(
            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
        )
        # TODO (Joao): investigate why LED has numerical issues in XLA generate
        self.supports_xla_generation = False
    def get_decoder(self):
        return self.led.decoder
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -556,6 +556,8 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelin
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
        # OpenAIGPT does not have past caching features
        self.supports_xla_generation = False
    def get_output_embeddings(self):
        return self.get_input_embeddings()
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -1332,6 +1332,8 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus
        super().__init__(config)
        self.model = TFSpeech2TextMainLayer(config, name="model")
        self.lm_head = tf.keras.layers.Dense(self.config.vocab_size, use_bias=False, name="lm_head")
        # TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate
        self.supports_xla_generation = False
    def get_encoder(self):
        return self.model.encoder
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -23,7 +23,6 @@ from typing import Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 from ...activations_tf import get_tf_activation
 from ...modeling_tf_outputs import (
@@ -1501,65 +1500,6 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
            "use_cache": use_cache,
        }
    def _update_model_kwargs_for_xla_generation(self, outputs, model_kwargs, current_pos, max_length):
        # TODO(Pvp, Joao, Matt) - this function can be cleaned a bit and refactored
        # quite some duplicated code patterns it seems
        past = outputs.past_key_values
        is_past_initialized = model_kwargs.pop("past", None) is not None
        decoder_attention_mask = model_kwargs.pop("decoder_attention_mask", None)
        batch_size = past[0][0].shape[0]
        if not is_past_initialized:
            # past[0].shape[2] is seq_length of prompt
            num_padding_values = max_length - past[0][0].shape[2] - 1
            padding_values = tf.scatter_nd(indices=[[2, 1]], updates=[num_padding_values], shape=(4, 2))
            new_past = ()
            for past_layer in past:
                new_past_layer = list(past_layer)
                for i in range(len(new_past_layer[:2])):
                    new_past_layer[i] = tf.pad(past_layer[i], padding_values)
                new_past += (tuple(new_past_layer),)
            # 1 one for decoder_start_token_id, Zeros for the currently-unfilled locations in the past tensor,
            # ones for the actual input_ids
            decoder_attention_mask = tf.concat(
                [
                    tf.ones((batch_size, 1), dtype=tf.int32),
                    tf.zeros((batch_size, num_padding_values), dtype=tf.int32),
                    tf.ones((batch_size, 1), dtype=tf.int32),
                ],
                axis=1,
            )
        else:
            slice_start_base = tf.constant([0, 0, 1, 0])
            decoder_attention_mask_update_slice = tf.ones((batch_size, 1), dtype=decoder_attention_mask.dtype)
            # correct 5 here
            new_past_index = current_pos - 1
            new_past = ()
            for past_layer in past:
                new_past_layer = list(past_layer)
                for i in range(len(new_past_layer[:2])):
                    update_slice = past_layer[i][:, :, -1:]
                    # Write the last slice to the first open location in the padded past array
                    # and then truncate the last slice off the array
                    new_past_layer[i] = dynamic_update_slice(
                        past_layer[i][:, :, :-1], update_slice, slice_start_base * new_past_index
                    )
                new_past += (tuple(new_past_layer),)
            update_start = tf.constant([0, 1], dtype=tf.int32) * new_past_index
            decoder_attention_mask = dynamic_update_slice(
                decoder_attention_mask, decoder_attention_mask_update_slice, update_start
            )
        # set `decoder_attention_mask` and `past`
        model_kwargs["decoder_attention_mask"] = decoder_attention_mask
        model_kwargs["past"] = new_past
        return model_kwargs
    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
        return self._shift_right(labels)
--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -797,6 +797,8 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
        # XLM does not have past caching features
        self.supports_xla_generation = False
    def get_lm_head(self):
        return self.pred_layer
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -1192,6 +1192,8 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
        # generate fails to convert to a graph with XLNet
        self.supports_xla_generation = False
    def get_lm_head(self):
        return self.lm_loss
--- a/tests/models/bart/test_modeling_tf_bart.py
+++ b/tests/models/bart/test_modeling_tf_bart.py
@@ -152,23 +152,6 @@ class TFBartModelTester:
        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
    def create_and_check_bart_xla_generate_fast(self, config, input_ids, *args):
        config.eos_token_id = None  # Generate until max length
        config.max_length = 10
        config.do_sample = False
        config.num_beams = 1
        model = TFBartForConditionalGeneration(config=config)
        # make sure there are no pad tokens in prompt
        input_ids = tf.where(input_ids != config.pad_token_id, input_ids, config.pad_token_id - 1)
        generated = model.generate(input_ids)
        generate_xla = tf.function(model.generate, jit_compile=True)
        generated_xla = generate_xla(input_ids)
        self.parent.assertListEqual(generated.numpy().tolist(), generated_xla.numpy().tolist())
 def prepare_bart_inputs_dict(
    config,
@@ -310,10 +293,6 @@ class TFBartModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestC
                                models_equal = False
                    self.assertTrue(models_equal)
    def test_bart_model_xla_generate_fast(self):
        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
        self.model_tester.create_and_check_bart_xla_generate_fast(config, inputs["input_ids"])
    def test_saved_model_creation(self):
        # This test is too long (>30sec) and makes fail the CI
        pass
@@ -703,10 +682,8 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase):
        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
        assert result == EXPECTED
-    def test_xsum_1_1_xla_greedy_generation(self):
+    def test_xsum_1_1_xla_generation(self):
-        # TODO (Joao): this is temporary test, while XLA beam search is not operational. Move the XLA==non-XLA
+        # same test as above, but with `no_repeat_ngram_size=0` (not compatible with XLA) and XLA comparison enabled
        # comparisons to the other tests after enabling XLA beam search.
        # Note -- `no_repeat_ngram_size` has to be disabled, since it is not compatible with XLA
        model = self.xsum_1_1_model
        assert model.model.decoder.embed_tokens._layer == model.model.shared
        ARTICLE = (
@@ -748,15 +725,16 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase):
        )
        EXPECTED = (
            " The International Criminal Court (ICC) has announced that it is to be investigated by the International"
-            " Criminal Court (ICC) over claims that the Palestinian genocide."
+            " Criminal Court (ICC) over allegations of war crimes."
        )
        dct = self.tok(ARTICLE, return_tensors="tf")
-        generated_ids = model.generate(**dct, num_beams=1, no_repeat_ngram_size=0)
+        generated_ids = model.generate(**dct, num_beams=4, no_repeat_ngram_size=0)
        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
        assert result == EXPECTED
        xla_generate = tf.function(model.generate, jit_compile=True)
-        generated_ids = xla_generate(**dct, num_beams=1, no_repeat_ngram_size=0)
+        generated_ids = xla_generate(**dct, num_beams=4, no_repeat_ngram_size=0)
        result = self.tok.batch_decode(generated_ids, skip_special_tokens=True)[0]
        assert result == EXPECTED
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ b/tests/models/gpt2/test_modeling_tf_gpt2.py
@@ -294,21 +294,6 @@ class TFGPT2ModelTester:
        result = model(inputs)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
    def create_and_check_gpt2_xla_generate_fast(self, config, input_ids, *args):
        config.eos_token_id = None  # Generate until max length
        config.max_length = 10
        model = TFGPT2LMHeadModel(config=config)
        # make sure there are no pad tokens in prompt
        input_ids = tf.where(input_ids != config.pad_token_id, input_ids, config.pad_token_id - 1)
        generated = model.generate(input_ids)
        generate_xla = tf.function(model.generate, jit_compile=True)
        generated_xla = generate_xla(input_ids)
        self.parent.assertListEqual(generated.numpy().tolist(), generated_xla.numpy().tolist())
    def create_and_check_gpt2_double_head(
        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
    ):
@@ -408,10 +393,6 @@ class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestC
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_lm_head(*config_and_inputs)
    def test_gpt2_xla_generate_fast(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_xla_generate_fast(*config_and_inputs)
    def test_gpt2_double_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
@@ -653,3 +634,27 @@ class TFGPT2ModelLanguageGenerationTest(unittest.TestCase):
            output_ids = xla_generate(**input_ids, do_sample=True, seed=[7, 0])
            output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            self.assertListEqual(output_strings, expected_output_string_xla)
    @slow
    def test_lm_generate_gpt2_beam_search_xla(self):
        model = TFGPT2LMHeadModel.from_pretrained("gpt2")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        sentences = ["The dog", "The flying machine"]
        expected_output_strings = [
            "The dog was found in the backyard of a home in the 6500 block of South Main Street",
            "The flying machine is a very powerful machine, but it's not a very powerful machine. It's",
        ]
        input_ids = tokenizer(sentences, return_tensors="tf", padding=True)
        output_ids = model.generate(**input_ids, do_sample=False, num_beams=2)
        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        self.assertListEqual(output_strings, expected_output_strings)
        xla_generate = tf.function(model.generate, jit_compile=True)
        output_ids = xla_generate(**input_ids, do_sample=False, num_beams=2)
        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        self.assertListEqual(output_strings, expected_output_strings)
--- a/tests/models/t5/test_modeling_tf_t5.py
+++ b/tests/models/t5/test_modeling_tf_t5.py
@@ -227,23 +227,6 @@ class TFT5ModelTester:
        # test that outputs are equal for slice
        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
    def create_and_check_t5_xla_generate_fast(self, config, input_ids, *args):
        config.eos_token_id = None  # Generate until max length
        config.max_length = 10
        config.do_sample = False
        config.num_beams = 1
        model = TFT5ForConditionalGeneration(config=config)
        # make sure there are no pad tokens in prompt
        input_ids = tf.where(input_ids != config.pad_token_id, input_ids, config.pad_token_id + 5)
        generated = model.generate(input_ids)
        generate_xla = tf.function(model.generate, jit_compile=True)
        generated_xla = generate_xla(input_ids)
        self.parent.assertListEqual(generated.numpy().tolist(), generated_xla.numpy().tolist())
    def prepare_config_and_inputs_for_common(self):
        config_and_inputs = self.prepare_config_and_inputs()
        (config, input_ids, input_mask, token_labels) = config_and_inputs
@@ -304,10 +287,6 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase):
        self.model_tester.create_and_check_t5_decoder_model_past_large_inputs(*config_and_inputs)
    def test_t5_model_xla_generate_fast(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_t5_xla_generate_fast(*config_and_inputs)
    def test_model_common_attributes(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -594,6 +573,43 @@ class TFT5GenerationIntegrationTests(unittest.TestCase):
        self.assertListEqual(expected_output_string, output_strings)
    @slow
    def test_beam_search_xla_generate_simple(self):
        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
        # tests XLA with task specific arguments
        task_specific_config = getattr(model.config, "task_specific_params", {})
        translation_config = task_specific_config.get("translation_en_to_fr", {})
        model.config.update(translation_config)
        # two examples with different lengths to confirm that attention masks are operational in XLA
        sentences = [
            model.config.prefix + "Today is a beautiful day.",
            model.config.prefix + "I have four cats, three dogs, two birds, and a horse.",
        ]
        input_ids = tokenizer(sentences, return_tensors="tf", padding=True).input_ids
        # xla_generate = tf.function(model.generate, jit_compile=True)
        xla_generate = tf.function(model.generate)
        # TODO (joao): there is something not quite right with XLA T5 -- as we increase `max_length` the two outputs
        # drift appart, where the XLA version clearly degrades its quality. XLA-related variables look fine (they are
        # being padded and filled in the right places). This also happens in other generation modes. Investigate.
        output_ids = model.generate(input_ids, num_beams=2, max_length=9)
        output_ids_xla = xla_generate(input_ids, num_beams=2, max_length=9)
        output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        output_strings_xla = tokenizer.batch_decode(output_ids_xla, skip_special_tokens=True)
        expected_output_string = [
            "Aujourd'hui est une belle journée.",
            "J'ai quatre chats,",
        ]
        self.assertListEqual(expected_output_string, output_strings)
        self.assertListEqual(expected_output_string, output_strings_xla)
    @slow
    def test_beam_search_generate(self):
        model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -1600,6 +1600,79 @@ class TFModelTesterMixin:
                model.compile(optimizer="sgd", run_eagerly=True)
                model.train_on_batch(test_batch, test_batch_labels)
    def _test_xla_generate(self, num_beams, num_return_sequences, max_length):
        def _generate_and_check_results(model, config, inputs_dict):
            if "input_ids" in inputs_dict:
                inputs = inputs_dict["input_ids"]
                # make sure there are no pad tokens in prompt, which may trigger unwanted behavior
                if config.pad_token_id is not None:
                    if config.pad_token_id == 0:
                        new_pad_token = config.pad_token_id + 1
                    else:
                        new_pad_token = config.pad_token_id - 1
                else:
                    new_pad_token = None
                inputs = tf.where(inputs != config.pad_token_id, inputs, new_pad_token)
            elif "input_features" in inputs_dict:
                inputs = inputs_dict["input_features"]
            else:
                raise ValueError("No valid generate input found in inputs_dict")
            generated = model.generate(inputs).numpy()
            generate_xla = tf.function(model.generate, jit_compile=True)
            generated_xla = generate_xla(inputs).numpy()
            self.assertListEqual(generated.tolist(), generated_xla.tolist())
        for model_class in self.all_generative_model_classes:
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            config.eos_token_id = None  # Generate until max length
            config.max_length = max_length
            config.do_sample = False
            config.num_beams = num_beams
            config.num_return_sequences = num_return_sequences
            model = model_class(config)
            if model.supports_xla_generation:
                _generate_and_check_results(model, config, inputs_dict)
            else:
                with self.assertRaises(ValueError):
                    _generate_and_check_results(model, config, inputs_dict)
    def test_xla_generate_fast(self):
        """
        Basic quick test for generate-compatible classes that confirms that XLA-generated tokens are the same as their
        non XLA counterparts.
        Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
        """
        num_beams = 1
        num_return_sequences = 1
        max_length = 10
        self._test_xla_generate(num_beams, num_return_sequences, max_length)
    @slow
    def test_xla_generate_slow(self):
        """
        Slow and challenging version of `test_xla_generate_fast` -- this test asks for several long sequences using
        beam search, with and without XLA. The two outputs should match, and a failure in this test indicates that the
        model may need further analysis if it is to be used for XLA generation.
        Either the model supports XLA generation and passes the inner test, or it raises an appropriate exception
        """
        # TODO (Joao): find the issues related to the following models. They are passing the fast test, but failing
        # the slow one.
        if any(
            [
                model in str(self).lower()
                for model in ["tfbart", "tfblenderbot", "tfmarian", "tfmbart", "tfopt", "tfpegasus"]
            ]
        ):
            return
        num_beams = 8
        num_return_sequences = 2
        max_length = 128
        self._test_xla_generate(num_beams, num_return_sequences, max_length)
    def _generate_random_bad_tokens(self, num_bad_tokens, model):
        # special tokens cannot be bad tokens
        special_tokens = []