From c72d7d91bf4899760725793421eff9da640c8527 Mon Sep 17 00:00:00 2001 From: Daniel Stancl <46073029+stancld@users.noreply.github.com> Date: Wed, 24 Aug 2022 11:51:05 +0200 Subject: [PATCH] Add TF implementation of `XGLMModel` (#16543) * Add TFXGLM models * Add todo: self.supports_xla_generation = False Co-authored-by: Daniel Stancl Co-authored-by: Daniel Stancl Co-authored-by: Joao Gante Co-authored-by: Daniel Co-authored-by: Patrick von Platen --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/xglm.mdx | 10 + src/transformers/__init__.py | 14 + .../models/auto/modeling_tf_auto.py | 2 + src/transformers/models/xglm/__init__.py | 28 + .../models/xglm/modeling_tf_xglm.py | 1000 +++++++++++++++++ src/transformers/utils/dummy_tf_objects.py | 24 + tests/models/xglm/test_modeling_tf_xglm.py | 284 +++++ 8 files changed, 1363 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/xglm/modeling_tf_xglm.py create mode 100644 tests/models/xglm/test_modeling_tf_xglm.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 257eba8171..17c0437678 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -312,7 +312,7 @@ Flax), PyTorch, and/or TensorFlow. | Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | | Wav2Vec2-Conformer | ❌ | ❌ | ✅ | ❌ | ❌ | | WavLM | ❌ | ❌ | ✅ | ❌ | ❌ | -| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ | +| XGLM | ✅ | ✅ | ✅ | ✅ | ✅ | | XLM | ✅ | ❌ | ✅ | ✅ | ❌ | | XLM-ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/docs/source/en/model_doc/xglm.mdx b/docs/source/en/model_doc/xglm.mdx index b8c395ce02..e35bab25f8 100644 --- a/docs/source/en/model_doc/xglm.mdx +++ b/docs/source/en/model_doc/xglm.mdx @@ -64,6 +64,16 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig [[autodoc]] XGLMForCausalLM - forward +## TFXGLMModel + +[[autodoc]] TFXGLMModel + - call + +## TFXGLMForCausalLM + +[[autodoc]] TFXGLMForCausalLM + - call + ## FlaxXGLMModel [[autodoc]] FlaxXGLMModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d6444e0844..3281d266a2 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2567,6 +2567,14 @@ else: "TFWav2Vec2PreTrainedModel", ] ) + _import_structure["models.xglm"].extend( + [ + "TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFXGLMForCausalLM", + "TFXGLMModel", + "TFXGLMPreTrainedModel", + ] + ) _import_structure["models.xlm"].extend( [ "TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4954,6 +4962,12 @@ if TYPE_CHECKING: TFWav2Vec2Model, TFWav2Vec2PreTrainedModel, ) + from .models.xglm import ( + TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXGLMForCausalLM, + TFXGLMModel, + TFXGLMPreTrainedModel, + ) from .models.xlm import ( TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST, TFXLMForMultipleChoice, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 6f9b15c131..359e1f05c4 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -77,6 +77,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict( ("vit", "TFViTModel"), ("vit_mae", "TFViTMAEModel"), ("wav2vec2", "TFWav2Vec2Model"), + ("xglm", "TFXGLMModel"), ("xlm", "TFXLMModel"), ("xlm-roberta", "TFXLMRobertaModel"), ("xlnet", "TFXLNetModel"), @@ -161,6 +162,7 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ("roberta", "TFRobertaForCausalLM"), ("roformer", "TFRoFormerForCausalLM"), ("transfo-xl", "TFTransfoXLLMHeadModel"), + ("xglm", "TFXGLMForCausalLM"), ("xlm", "TFXLMWithLMHeadModel"), ("xlnet", "TFXLNetLMHeadModel"), ] diff --git a/src/transformers/models/xglm/__init__.py b/src/transformers/models/xglm/__init__.py index 2ab60e4cb4..096886e5bd 100644 --- a/src/transformers/models/xglm/__init__.py +++ b/src/transformers/models/xglm/__init__.py @@ -23,6 +23,7 @@ from ...utils import ( _LazyModule, is_flax_available, is_sentencepiece_available, + is_tf_available, is_tokenizers_available, is_torch_available, ) @@ -73,6 +74,20 @@ else: ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_xglm"] = [ + "TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFXGLMForCausalLM", + "TFXGLMModel", + "TFXGLMPreTrainedModel", + ] + + if TYPE_CHECKING: from .configuration_xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig @@ -108,6 +123,19 @@ if TYPE_CHECKING: else: from .modeling_flax_xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_xglm import ( + TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXGLMForCausalLM, + TFXGLMModel, + TFXGLMPreTrainedModel, + ) + else: import sys diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py new file mode 100644 index 0000000000..1b6828b098 --- /dev/null +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -0,0 +1,1000 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 XGLM model.""" + + +import math +import random +from typing import Any, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation + +# Public API +from ...file_utils import ( + DUMMY_INPUTS, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions, TFCausalLMOutputWithCrossAttentions +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSharedEmbeddings, + get_initializer, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_xglm import XGLMConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/xglm-564M" +_CONFIG_FOR_DOC = "XGLMConfig" +_TOKENIZER_FOR_DOC = "XGLMTokenizer" + + +TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/xglm-564M", + # See all XGLM models at https://huggingface.co/models?filter=xglm +] + + +LARGE_NEGATIVE = -1e8 + + +def create_sinusiodal_positions(num_positions: int, embedding_dim: int, padding_idx: Optional[int]) -> tf.Tensor: + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = tf.exp(tf.range(half_dim, dtype=tf.float32) * -emb) + emb = tf.expand_dims(tf.range(num_positions, dtype=tf.float32), axis=1) * tf.expand_dims(emb, axis=0) + emb = tf.reshape(tf.concat([tf.sin(emb), tf.cos(emb)], axis=1), (num_positions, -1)) + if embedding_dim % 2 == 1: + # zero pad + emb = tf.concat([emb, tf.zeros((num_positions, 1))], axis=1) + if padding_idx is not None: + _padding_mask = tf.concat( + [ + tf.ones((padding_idx, shape_list(emb)[1])), + tf.zeros((1, shape_list(emb)[1])), + tf.ones((shape_list(emb)[0] - padding_idx - 1, shape_list(emb)[1])), + ], + axis=0, + ) + emb *= _padding_mask + + return tf.Variable(emb, trainable=False, name="model.embed_positions.weights") + + +def _create_position_ids_from_input_ids( + input_ids: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int] +) -> tf.Tensor: + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = tf.where(input_ids != padding_idx, 1, 0) + incremental_indices = (tf.cast(tf.cumsum(mask, axis=1), dtype=mask.dtype) + past_key_values_length) * mask + return tf.cast(incremental_indices, dtype=tf.int64) + padding_idx + + +def _create_position_ids_from_inputs_embeds( + inputs_embeds: tf.Tensor, past_key_values_length: int, padding_idx: Optional[int] +) -> tf.Tensor: + """ + Args: + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + inputs_embeds: tf.Tensor + Returns: tf.Tensor + """ + input_shape = shape_list(inputs_embeds)[:-1] + sequence_length = input_shape[1] + + position_ids = tf.range(padding_idx + 1, sequence_length + padding_idx + 1, dtype=tf.int64) + + return tf.broadcast_to(tf.expand_dims(position_ids, axis=0), input_shape) + past_key_values_length + + +# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->XGLM +class TFXGLMAttention(tf.keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = tf.keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[tf.Tensor]]] = None, + attention_mask: Optional[tf.Tensor] = None, + layer_head_mask: Optional[tf.Tensor] = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + # The tf.debugging asserts are not compliant with XLA then they + # have to be disabled in other modes than eager. + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + # The tf.debugging asserts are not compliant with XLA then they + # have to be disabled in other modes than eager. + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + # The tf.debugging asserts are not compliant with XLA then they + # have to be disabled in other modes than eager. + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + # The tf.debugging asserts are not compliant with XLA then they + # have to be disabled in other modes than eager. + if tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + +class TFXGLMDecoderLayer(tf.keras.layers.Layer): + def __init__(self, config: XGLMConfig, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFXGLMAttention( + embed_dim=self.embed_dim, + num_heads=config.attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + name="self_attn", + ) + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + + if config.add_cross_attention: + self.encoder_attn = TFXGLMAttention( + embed_dim=self.embed_dim, + num_heads=config.attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + name="encoder_attn", + ) + self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization( + epsilon=1e-5, name="encoder_attn_layer_norm" + ) + + self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.fc1 = tf.keras.layers.Dense(config.ffn_dim, name="fc1") + self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + + # Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer.call + def call( + self, + hidden_states: tf.Tensor, + attention_mask: Optional[tf.Tensor] = None, + encoder_hidden_states: Optional[tf.Tensor] = None, + encoder_attention_mask: Optional[tf.Tensor] = None, + layer_head_mask: Optional[tf.Tensor] = None, + cross_attn_layer_head_mask: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[tf.Tensor]] = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape *(seq_len, batch, embed_dim)* + attention_mask (`tf.Tensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape *(seq_len, batch, embed_dim)* + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + *(decoder_attention_heads,)* + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. + *(decoder_attention_heads,)* + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + +@keras_serializable +class TFXGLMMainLayer(tf.keras.layers.Layer): + config_class = XGLMConfig + + def __init__( + self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs, **kwargs: Any + ) -> None: + super().__init__(*inputs, **kwargs) + + self.config = config + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = TFSharedEmbeddings( + config.vocab_size, config.d_model, self.padding_idx, name="embed_tokens" + ) + + self.offset = 2 + self._embed_positions_weights = create_sinusiodal_positions( + num_positions=config.max_position_embeddings + self.offset, + embedding_dim=config.d_model, + padding_idx=config.pad_token_id, + ) + + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.layers = [TFXGLMDecoderLayer(config, name=f"layers.{i}") for i in range(config.num_layers)] + self.layerdrop = config.layerdrop + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + + def get_input_embeddings(self) -> TFSharedEmbeddings: + return self.embed_tokens + + def set_input_embeddings(self, value: TFSharedEmbeddings) -> None: + self.embed_tokens = value + + def _prepare_decoder_attention_mask( + self, + attention_mask: Optional[tf.Tensor], + input_shape: tf.TensorShape, + past_key_values_length: int, + ) -> tf.Tensor: + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask: Optional[tf.Tensor] = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length) + + if attention_mask is not None: + expand_attention_mask = _expand_mask(attention_mask, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expand_attention_mask + if combined_attention_mask is None + else expand_attention_mask + combined_attention_mask + ) + + return combined_attention_mask + + def embed_positions( + self, + input_ids: Optional[TFModelInputType] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + past_key_values_length: Optional[int] = None, + ) -> tf.Tensor: + if input_ids is not None: + position_ids = _create_position_ids_from_input_ids(input_ids, past_key_values_length, self.padding_idx) + else: + position_ids = _create_position_ids_from_inputs_embeds( + inputs_embeds, past_key_values_length, self.padding_idx + ) + + positions = tf.gather(self._embed_positions_weights, position_ids, axis=0) + return positions + + @unpack_inputs + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + cross_attn_head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs: Any, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, (-1, input_shape[-1])) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length) + + hidden_states = tf.cast(inputs_embeds, dtype=tf.float32) + positions + + hidden_states = self.dropout(hidden_states, training=training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + # The tf.debugging asserts are not compliant with XLA then they + # have to be disabled in other modes than eager. + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None and tf.executing_eagerly(): + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None), + past_key_value=past_key_value, + ) + + if use_cache: + next_decoder_cache += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_cross_attn,) + + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class TFXGLMPreTrainedModel(TFPreTrainedModel): + config_class = XGLMConfig + base_model_prefix = "model" + + @property + def dummy_inputs(self): + pad_token = 1 + input_ids = tf.cast(tf.convert_to_tensor(DUMMY_INPUTS), tf.int32) + dummy_inputs = { + "input_ids": input_ids, + "attention_mask": tf.math.not_equal(input_ids, pad_token), + } + return dummy_inputs + + @tf.function( + input_signature=[ + { + "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + } + ] + ) + def serving(self, inputs): + output = self.call(inputs) + + return self.serving_output(output) + + +XGLM_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using [`tf.keras.Model.fit`] method which currently requires having all the + tensors in the first argument of the model call function: `model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the + first positional argument : + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + + + Args: + config ([`XGLMConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +XGLM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`XGLMTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of + the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(num_layers, attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(num_layers, attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.num_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.", + XGLM_START_DOCSTRING, +) +class TFXGLMModel(TFXGLMPreTrainedModel): + """ + Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`TFXGLMDecoderLayer`] + + Args: + config: XGLMConfig + embed_tokens: [TFSharedEmbeddings]: output embedding + """ + + def __init__( + self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs: Any, **kwargs: Any + ) -> None: + super().__init__(config, *inputs, **kwargs) + + self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model") + + @unpack_inputs + @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + cross_attn_head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs: Any, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def serving_output(self, output): + pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + cross_attns = ( + tf.convert_to_tensor(output.cross_attentions) + if self.config.output_attentions and self.config.add_cross_attention + else None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output.hidden_states, + past_key_values=pkv, + hidden_states=hs, + attentions=attns, + cross_attentions=cross_attns, + ) + + +@add_start_docstrings( + """ + The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + XGLM_START_DOCSTRING, +) +class TFXGLMForCausalLM(TFXGLMPreTrainedModel, TFCausalLanguageModelingLoss): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"model.embed_positions.weights", + r"lm_head.weight", + ] + _keys_to_ignore_on_save = [ + r"model.embed_positions.weights", + ] + + def __init__( + self, config: XGLMConfig, embed_tokens: Optional[TFSharedEmbeddings] = None, *inputs: Any, **kwargs: Any + ) -> None: + super().__init__(config, *inputs, **kwargs) + + self.model = TFXGLMMainLayer(config, embed_tokens=embed_tokens, name="model") + self.lm_head = tf.keras.layers.Dense( + config.vocab_size, + use_bias=False, + kernel_initializer=get_initializer(config.init_std), + name="lm_head", + ) + + # TODO (Joao): investigate why XGLM has numerical issues in XLA generate + self.supports_xla_generation = False + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, inputs, past=None, use_cache=None, **kwargs): + # only last token for inputs_ids if past is defined in kwargs + if past: + inputs = tf.expand_dims(inputs[:, -1], -1) + + attention_mask = kwargs.get("attention_mask", None) + + return { + "input_ids": inputs, + "attention_mask": attention_mask, + "past": past, + "use_cache": use_cache, + } + + @unpack_inputs + @add_start_docstrings_to_model_forward(XGLM_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_hidden_states: Optional[Union[np.ndarray, tf.Tensor]] = None, + encoder_attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + cross_attn_head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs: Any, + ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]: + r""" + labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = outputs[0] + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = lm_logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels, shifted_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def serving_output(self, output): + pkv = tf.convert_to_tensor(output.past_key_values) if self.config.use_cache else None + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + cross_attns = ( + tf.convert_to_tensor(output.cross_attentions) + if self.config.output_attentions and self.config.add_cross_attention + else None + ) + + return TFCausalLMOutputWithCrossAttentions( + loss=output.loss, + logits=output.logits, + past_key_values=pkv, + hidden_states=hs, + attentions=attns, + cross_attentions=cross_attns, + ) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(tf.gather(past_state, beam_idx, axis=0) for past_state in layer_past),) + return reordered_past diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index fec5ffe700..5f8124ae55 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -2270,6 +2270,30 @@ class TFWav2Vec2PreTrainedModel(metaclass=DummyObject): requires_backends(self, ["tf"]) +TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFXGLMForCausalLM(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFXGLMModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFXGLMPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py new file mode 100644 index 0000000000..b6387901dc --- /dev/null +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import XGLMConfig, XGLMTokenizer, is_tf_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask + + +if is_tf_available(): + import tensorflow as tf + + from transformers.models.xglm.modeling_tf_xglm import ( + TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, + TFXGLMForCausalLM, + TFXGLMModel, + ) + + +@require_tf +class TFXGLMModelTester: + config_cls = XGLMConfig + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=14, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + d_model=32, + num_hidden_layers=5, + num_attention_heads=4, + ffn_dim=37, + activation_function="gelu", + activation_dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = d_model + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.ffn_dim = ffn_dim + self.activation_function = activation_function + self.activation_dropout = activation_dropout + self.attention_dropout = attention_dropout + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.scope = None + self.bos_token_id = 0 + self.eos_token_id = 2 + self.pad_token_id = 1 + + def get_large_model_config(self): + return XGLMConfig.from_pretrained("facebook/xglm-564M") + + def prepare_config_and_inputs(self): + input_ids = tf.clip_by_value( + ids_tensor([self.batch_size, self.seq_length], self.vocab_size), clip_value_min=0, clip_value_max=3 + ) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + config = self.get_config() + + head_mask = floats_tensor([self.num_hidden_layers, self.num_attention_heads], 2) + + return ( + config, + input_ids, + input_mask, + head_mask, + ) + + def get_config(self): + return XGLMConfig( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + num_layers=self.num_hidden_layers, + attention_heads=self.num_attention_heads, + ffn_dim=self.ffn_dim, + activation_function=self.activation_function, + activation_dropout=self.activation_dropout, + attention_dropout=self.attention_dropout, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + use_cache=True, + bos_token_id=self.bos_token_id, + eos_token_id=self.eos_token_id, + pad_token_id=self.pad_token_id, + return_dict=True, + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + + ( + config, + input_ids, + input_mask, + head_mask, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "head_mask": head_mask, + } + + return config, inputs_dict + + +@require_tf +class TFXGLMModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = (TFXGLMModel, TFXGLMForCausalLM) if is_tf_available() else () + all_generative_model_classes = (TFXGLMForCausalLM,) if is_tf_available() else () + test_onnx = False + test_missing_keys = False + test_pruning = False + + def setUp(self): + self.model_tester = TFXGLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XGLMConfig, n_embd=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + + if model_class in self.all_generative_model_classes: + x = model.get_output_embeddings() + assert isinstance(x, tf.keras.layers.Layer) + name = model.get_bias() + assert name is None + else: + x = model.get_output_embeddings() + assert x is None + name = model.get_bias() + assert name is None + + @slow + def test_batch_generation(self): + model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + + tokenizer.padding_side = "left" + + # use different length sentences to test batching + sentences = [ + "Hello, my dog is a little", + "Today, I", + ] + + inputs = tokenizer(sentences, return_tensors="tf", padding=True) + + outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]) + + inputs_non_padded = tokenizer(sentences[0], return_tensors="tf").input_ids + output_non_padded = model.generate(input_ids=inputs_non_padded) + + num_paddings = ( + inputs_non_padded.shape[-1] + - tf.math.reduce_sum(tf.cast(inputs["attention_mask"][-1], dtype=tf.int64)).numpy() + ) + inputs_padded = tokenizer(sentences[1], return_tensors="tf").input_ids + output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings) + + batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) + non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True) + padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True) + + expected_output_sentence = [ + "Hello, my dog is a little bit of a shy one, but he is very friendly", + "Today, I am going to share with you a few of my favorite things", + ] + self.assertListEqual(expected_output_sentence, batch_out_sentence) + self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence]) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_XGLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFXGLMModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + @unittest.skip(reason="Currently, model embeddings are going to undergo a major refactor.") + def test_resize_token_embeddings(self): + super().test_resize_token_embeddings() + + +@require_tf +class TFXGLMModelLanguageGenerationTest(unittest.TestCase): + @slow + def test_lm_generate_xglm(self, verify_outputs=True): + model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + input_ids = tf.convert_to_tensor([[2, 268, 9865]], dtype=tf.int32) # The dog + # The dog is a very friendly dog. He is very affectionate and loves to play with other + # fmt: off + expected_output_ids = [2, 268, 9865, 67, 11, 1988, 57252, 9865, 5, 984, 67, 1988, 213838, 1658, 53, 70446, 33, 6657, 278, 1581] + # fmt: on + output_ids = model.generate(input_ids, do_sample=False, num_beams=1) + if verify_outputs: + self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids) + + @slow + def test_xglm_sample(self): + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + + tf.random.set_seed(0) + tokenized = tokenizer("Today is a nice day and", return_tensors="tf") + input_ids = tokenized.input_ids + output_ids = model.generate(input_ids, do_sample=True, seed=[7, 0]) + output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + EXPECTED_OUTPUT_STR = ( + "Today is a nice day and warm evening here over Southern Alberta!! Today when they closed schools due" + ) + self.assertEqual(output_str, EXPECTED_OUTPUT_STR) + + @slow + def test_lm_generate_xglm_left_padding(self): + """Tests that the generated text is the same, regarless of left padding""" + tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M") + model = TFXGLMForCausalLM.from_pretrained("facebook/xglm-564M") + + tokenizer.padding_side = "left" + + generation_kwargs = { + "bad_words_ids": [tokenizer("is").input_ids, tokenizer("angry about").input_ids], + "no_repeat_ngram_size": 2, + "do_sample": False, + "repetition_penalty": 1.3, + } + expected_output_string = ( + "Today is a beautiful day and I am so glad that we have the opportunity to spend time with" + ) + + sentences = ["Today is a beautiful day and"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + # using default length + output_ids = model.generate(**input_ids, **generation_kwargs) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertEqual(output_strings[0], expected_output_string) + + sentences = ["Today is a beautiful day and", "This is a very long input that we absolutely don't care about"] + input_ids = tokenizer(sentences, return_tensors="tf", padding=True) + # longer max length to capture the full length (remember: it is left padded) + output_ids = model.generate(**input_ids, **generation_kwargs, max_length=28) + output_strings = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + self.assertEqual(output_strings[0], expected_output_string)