From cbe63949d76efd153a1f389f38fe9ce1287e06b0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 22 Dec 2020 23:41:20 +0100 Subject: [PATCH] Model Templates for Seq2Seq (#9251) * adapt cookie cutter * fix copy past statement * delete copy statements for now * remove unused import from template * make doc rst * correct config docstring * correct training * correct inputs processing tf enc dec * make style * adapt templates * clean tabs * correct tensor -> Tensor naming * correct indent * correct templates * fix the test * break lines to avoid > 119 * Apply suggestions from code review --- .github/workflows/model-templates.yml | 2 + .../models/bart/modeling_tf_bart.py | 3 +- .../__init__.py | 23 +- .../configuration.json | 3 +- ...on_{{cookiecutter.lowercase_modelname}}.py | 110 +- ...tf_{{cookiecutter.lowercase_modelname}}.py | 1169 ++++++++++++++ ...ng_{{cookiecutter.lowercase_modelname}}.py | 1357 +++++++++++++++++ ...tf_{{cookiecutter.lowercase_modelname}}.py | 270 ++++ ...ng_{{cookiecutter.lowercase_modelname}}.py | 393 +++++ ...ce_{{cookiecutter.lowercase_modelname}}.py | 90 +- ...st_{{cookiecutter.lowercase_modelname}}.py | 41 + ...on_{{cookiecutter.lowercase_modelname}}.py | 40 +- .../{{cookiecutter.lowercase_modelname}}.rst | 38 +- .../adding_a_new_model/cookiecutter.json | 7 +- .../tests/encoder-bert-tokenizer.json | 3 +- .../tests/pt-encoder-bert-tokenizer.json | 3 +- .../tests/pt-seq-2-seq-bart-tokenizer.json | 11 + .../adding_a_new_model/tests/standalone.json | 3 +- .../tests/tf-encoder-bert-tokenizer.json | 3 +- .../tests/tf-seq-2-seq-bart-tokenizer.json | 11 + tests/test_modeling_tf_bart.py | 2 +- 21 files changed, 3561 insertions(+), 21 deletions(-) create mode 100644 templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json create mode 100644 templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json diff --git a/.github/workflows/model-templates.yml b/.github/workflows/model-templates.yml index 2afed1366b..71ddf9ab4d 
100644 --- a/.github/workflows/model-templates.yml +++ b/.github/workflows/model-templates.yml @@ -40,6 +40,8 @@ jobs: transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model + transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model + transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model make style python utils/check_table.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index a4658ac39d..03c24c209c 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -1077,7 +1077,7 @@ class TFBartModel(TFBartPretrainedModel): decoder_outputs = self.decoder( inputs["decoder_input_ids"], - attention_mask=decoder_attention_mask, + attention_mask=inputs["decoder_attention_mask"], encoder_hidden_states=inputs["encoder_outputs"][0], encoder_attention_mask=inputs["attention_mask"], past_key_values=inputs["past_key_values"], @@ -1228,6 +1228,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel): output_attentions=inputs["output_attentions"], output_hidden_states=inputs["output_hidden_states"], return_dict=inputs["return_dict"], + training=inputs["training"], ) lm_logits = self.model.shared(outputs[0], mode="linear") lm_logits = lm_logits + self.final_logits_bias diff --git 
a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py index 03a115a4cb..bb46df654d 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py @@ -30,6 +30,7 @@ if is_tokenizers_available(): from .tokenization_{{cookiecutter.lowercase_modelname}}_fast import {{cookiecutter.camelcase_modelname}}TokenizerFast {%- if (cookiecutter.generate_tensorflow_and_pytorch == "PyTorch & TensorFlow" or cookiecutter.generate_tensorflow_and_pytorch == "PyTorch") %} +{% if cookiecutter.is_encoder_decoder_model == "False" %} if is_torch_available(): from .modeling_{{cookiecutter.lowercase_modelname}} import ( {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -44,8 +45,20 @@ if is_torch_available(): {{cookiecutter.camelcase_modelname}}PreTrainedModel, load_tf_weights_in_{{cookiecutter.lowercase_modelname}}, ) +{% else %} +if is_torch_available(): + from .modeling_{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, + {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}Model, + {{cookiecutter.camelcase_modelname}}PreTrainedModel, + ) +{% endif %} {% endif %} {%- if (cookiecutter.generate_tensorflow_and_pytorch == "PyTorch & TensorFlow" or cookiecutter.generate_tensorflow_and_pytorch == "TensorFlow") %} +{% if cookiecutter.is_encoder_decoder_model == "False" %} if is_tf_available(): from .modeling_tf_{{cookiecutter.lowercase_modelname}} import ( TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -59,4 +72,12 @@ if is_tf_available(): 
TF{{cookiecutter.camelcase_modelname}}Model, TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, ) -{% endif %} \ No newline at end of file +{% else %} +if is_tf_available(): + from .modeling_tf_{{cookiecutter.lowercase_modelname}} import ( + TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + TF{{cookiecutter.camelcase_modelname}}Model, + TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, + ) +{% endif %} +{% endif %} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json index 71c31a09c9..72ab9681d3 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json @@ -6,5 +6,6 @@ "authors": "{{cookiecutter.authors}}", "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}", "tokenizer_type": "{{cookiecutter.tokenizer_type}}", - "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}" + "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}", + "is_encoder_decoder_model": ["True", "False"] } diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py index e9279b9788..ebb1cd1637 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py @@ -39,6 +39,7 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): Args: + {% if 
cookiecutter.is_encoder_decoder_model == "False" -%} vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or @@ -70,6 +71,50 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): The epsilon used by the layer normalization layers. + {% else -%} + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or + :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in encoder. 
+ activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper + <https://arxiv.org/abs/1909.11556>`__ for more details. + decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper + <https://arxiv.org/abs/1909.11556>`__ for more details. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ {% endif -%} Example:: @@ -88,9 +133,9 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): def __init__( self, + {% if cookiecutter.is_encoder_decoder_model == "False" -%} vocab_size=30522, hidden_size=768, - is_encoder_decoder=False, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, @@ -101,6 +146,29 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, + is_encoder_decoder=False, + {% else -%} + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + {% endif -%} pad_token_id=1, bos_token_id=0, eos_token_id=2, @@ -108,13 +176,19 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): ): super().__init__( pad_token_id=pad_token_id, - is_encoder_decoder=is_encoder_decoder, bos_token_id=bos_token_id, eos_token_id=eos_token_id, + {% if cookiecutter.is_encoder_decoder_model == "False" -%} + {% else -%} + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + {% endif -%} **kwargs ) self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + {% if cookiecutter.is_encoder_decoder_model == "False" -%} self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -122,8 +196,36 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob self.attention_probs_dropout_prob = attention_probs_dropout_prob - 
self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps + {% else -%} + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + {% endif -%} + {% if cookiecutter.is_encoder_decoder_model == "False" %} + {%- else %} + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model + {%- endif %} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 5d505371d0..5fd7f26c46 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -14,6 +14,8 @@ # limitations under the License. """ TF 2.0 {{cookiecutter.modelname}} model. 
""" +{% if cookiecutter.is_encoder_decoder_model == "False" %} + import tensorflow as tf @@ -1395,3 +1397,1170 @@ class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutte hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +{% else %} +import random +from typing import Dict, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPast, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, +) + +# Public API +from ...modeling_tf_utils import ( + DUMMY_INPUTS, + TFPreTrainedModel, + TFSharedEmbeddings, + TFWrappedEmbeddings, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config" +_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer" + + +LARGE_NEGATIVE = -1e8 + + +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + shifted_input_ids = tf.cast(input_ids, tf.int32) + shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1) + start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id) + shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, tf.int32)) + + # Make sure the assertion 
op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for uni-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = tf.ones((tgt_len, tgt_len), dtype=tf.float32) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + mask = tf.cast(mask, tf.float32) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=tf.float32), mask], axis=-1) + return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)) + + +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = shape_list(mask) + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = tf.cast(tf.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)), tf.float32) + + return (1.0 - expanded_mask) * LARGE_NEGATIVE + + +class TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(TFSharedEmbeddings): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, **kwargs): + assert padding_idx is not None, "padding_idx cannot be None" + super().__init__(num_embeddings, embedding_dim, **kwargs) + + def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input_shape[:2] + + positions = tf.range( + past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range" + ) + return super().call(positions) + + +class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = tf.keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[tf.Tensor]]] = None, + attention_mask: Optional[tf.Tensor] = None, + training=False, + ) -> Tuple[tf.Tensor, Optional[tf.Tensor]]: + """Input shape: Batch x Time x 
Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {shape_list(attn_weights)}", + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {shape_list(attention_mask)}", + ) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = tf.nn.softmax(attn_weights, axis=-1) + + attn_probs = self.dropout(attn_weights, training=training) + + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {shape_list(attn_output)}", + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, 
(bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + +class TF{{cookiecutter.camelcase_modelname}}EncoderLayer(tf.keras.layers.Layer): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + + def call(self, hidden_states: tf.Tensor, attention_mask: tf.Tensor, training=False): + """ + Args: + hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (:obj:`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ """ + residual = hidden_states + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask + ) + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + +class TF{{cookiecutter.camelcase_modelname}}DecoderLayer(tf.keras.layers.Layer): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TF{{cookiecutter.camelcase_modelname}}Attention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TF{{cookiecutter.camelcase_modelname}}Attention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, 
name="encoder_attn_layer_norm") + self.fc1 = tf.keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + + def call( + self, + hidden_states, + attention_mask: Optional[tf.Tensor] = None, + encoder_hidden_states: Optional[tf.Tensor] = None, + encoder_attention_mask: Optional[tf.Tensor] = None, + past_key_value: Optional[Tuple[tf.Tensor]] = None, + training=False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (:obj:`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (:obj:`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`tf.Tensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (:obj:`tf.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, _, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return ( + 
hidden_states, + self_attn_weights, + present_key_value, + ) + + +class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): + config_class = {{cookiecutter.camelcase_modelname}}Config + base_model_prefix = "model" + + @property + def dummy_inputs(self): + pad_token = 1 + input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) + decoder_input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32) + dummy_inputs = { + "decoder_input_ids": decoder_input_ids, + "attention_mask": tf.math.not_equal(input_ids, pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the + generic methods the library implements for all its model (such as downloading or saving, resizing the input + embeddings, pruning heads etc.) + + This model is also a `tf.keras.Model `__ subclass. Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all + the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Args: + config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.TFPreTrainedModel.from_pretrained` method to load the + model weights. +""" + +{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. 
By default, the model will create this tensor by + shifting the input_ids right, following the paper. + decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + encoder_outputs (:obj:`tf.FloatTensor`, `optional`): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). Set to :obj:`False` during training, :obj:`True` during generation + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.TFModelOutput` instead of a plain tuple. 
+ training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): + config_class = {{cookiecutter.camelcase_modelname}}Config + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`TF{{cookiecutter.camelcase_modelname}}EncoderLayer`. + + Args: + config: {{cookiecutter.camelcase_modelname}}Config + """ + + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + + self.embed_tokens = embed_tokens + self.embed_positions = TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + name="embed_positions", + ) + self.layers = [TF{{cookiecutter.camelcase_modelname}}EncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + """ + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. 
See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs["inputs_embeds"] is None: + inputs_embeds = self.embed_tokens(inputs["input_ids"]) + else: + inputs_embeds = inputs["inputs_embeds"] + + inputs_embeds = inputs_embeds + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states, training=inputs["training"]) + + # check attention mask and invert + if inputs["attention_mask"] is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(inputs["attention_mask"]) + else: + attention_mask = None + + encoder_states = () if inputs["output_hidden_states"] else None + all_attentions = () if inputs["output_attentions"] else None + + # encoder layers + for encoder_layer in self.layers: + + if inputs["output_hidden_states"]: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if inputs["training"] and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer(hidden_states, attention_mask) + + if 
inputs["output_attentions"]: + all_attentions += (attn,) + + if inputs["output_hidden_states"]: + encoder_states = encoder_states + (hidden_states,) + + if not inputs["return_dict"]: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +@keras_serializable +class TF{{cookiecutter.camelcase_modelname}}Decoder(tf.keras.layers.Layer): + config_class = {{cookiecutter.camelcase_modelname}}Config + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`TF{{cookiecutter.camelcase_modelname}}DecoderLayer` + + Args: + config: {{cookiecutter.camelcase_modelname}}Config + embed_tokens: output embedding + """ + + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[TFSharedEmbeddings] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TF{{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + self.padding_idx, + name="embed_positions", + ) + self.layers = [TF{{cookiecutter.camelcase_modelname}}DecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + r""" + Args: + input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, 
sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[tf.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. 
+ + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = ( + inputs["past_key_values"][0][0].shape[2] if inputs["past_key_values"] is not None else 0 + ) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + if inputs["inputs_embeds"] is None: + inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) + + hidden_states = inputs["inputs_embeds"] + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if inputs["attention_mask"] is None and inputs["input_ids"] is not None and input_shape[-1] > 1: + inputs["attention_mask"] = tf.cast( + tf.math.not_equal(inputs["input_ids"], self.config.pad_token_id), inputs["input_ids"].dtype + ) + inputs["attention_mask"] = tf.concat( + [ + 
tf.ones((input_shape[0], past_key_values_length), dtype=inputs["attention_mask"].dtype), + inputs["attention_mask"], + ], + axis=-1, + ) + else: + inputs["attention_mask"] = tf.ones( + (input_shape[0], input_shape[1] + past_key_values_length), dtype=tf.int32 + ) + + if inputs["encoder_hidden_states"] is not None and inputs["encoder_attention_mask"] is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + inputs["encoder_attention_mask"] = _expand_mask(inputs["encoder_attention_mask"], tgt_len=input_shape[-1]) + + hidden_states = self.layernorm_embedding(hidden_states + positions) + hidden_states = self.dropout(hidden_states, training=inputs["training"]) + + # decoder layers + all_hidden_states = () + all_self_attns = () + present_key_values = () + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if inputs["output_hidden_states"]: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + + if inputs["training"] and (dropout_probability < self.layerdrop): + continue + + past_key_value = inputs["past_key_values"][idx] if inputs["past_key_values"] is not None else None + + hidden_states, layer_self_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=inputs["encoder_hidden_states"], + encoder_attention_mask=inputs["encoder_attention_mask"], + past_key_value=past_key_value, + ) + + if inputs["use_cache"]: + present_key_values += (present_key_value,) + + if inputs["output_attentions"]: + all_self_attns += (layer_self_attn,) + + if inputs["output_hidden_states"]: + all_hidden_states += (hidden_states,) + else: + all_hidden_states = None + + all_self_attns = list(all_self_attns) if inputs["output_attentions"] else None + + present_key_values = (encoder_hidden_states, present_key_values) if inputs["use_cache"] else None + + if not inputs["return_dict"]: + return hidden_states, 
present_key_values, all_hidden_states, all_self_attns + else: + return TFBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +@add_start_docstrings( + "The bare {{cookiecutter.uppercase_modelname}} Model outputting raw hidden-states without any specific head on top.", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +@keras_serializable +class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel): + base_model_prefix = "model" + + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, config.pad_token_id, name="model.shared") + + with tf.compat.v1.variable_scope("model.shared") as shared_abs_scope_name: + pass + + # Wraps layer to avoid problems with weight restoring and ensuring we're in the correct TF scope. 
+ embed_tokens = TFWrappedEmbeddings(self.shared, abs_scope_name=shared_abs_scope_name) + embed_tokens.vocab_size = self.shared.vocab_size + embed_tokens.hidden_size = self.shared.hidden_size + + self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, embed_tokens, name="encoder") + self.decoder = TF{{cookiecutter.camelcase_modelname}}Decoder(config, embed_tokens, name="decoder") + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs + ): + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["decoder_input_ids"] is None and inputs["decoder_inputs_embeds"] is None: + inputs["use_cache"] = False + + inputs["output_hidden_states"] = ( + inputs["output_hidden_states"] + if inputs["output_hidden_states"] is not None + else self.config.output_hidden_states + ) + + if 
inputs["encoder_outputs"] is None: + inputs["encoder_outputs"] = self.encoder( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], TFBaseModelOutput): + inputs["encoder_outputs"] = TFBaseModelOutput( + last_hidden_state=inputs["encoder_outputs"][0], + hidden_states=inputs["encoder_outputs"][1] if len(inputs["encoder_outputs"]) > 1 else None, + attentions=inputs["encoder_outputs"][2] if len(inputs["encoder_outputs"]) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not inputs["return_dict"] and not isinstance(inputs["encoder_outputs"], tuple): + inputs["encoder_outputs"] = inputs["encoder_outputs"].to_tuple() + + decoder_outputs = self.decoder( + inputs["decoder_input_ids"], + attention_mask=inputs["decoder_attention_mask"], + encoder_hidden_states=inputs["encoder_outputs"][0], + encoder_attention_mask=inputs["attention_mask"], + past_key_values=inputs["past_key_values"], + inputs_embeds=inputs["decoder_inputs_embeds"], + use_cache=inputs["use_cache"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + if not inputs["return_dict"]: + return decoder_outputs + inputs["encoder_outputs"] + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + 
encoder_last_hidden_state=inputs["encoder_outputs"].last_hidden_state, + encoder_hidden_states=inputs["encoder_outputs"].hidden_states, + encoder_attentions=inputs["encoder_outputs"].attentions, + ) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def get_output_embeddings(self): + return self.shared + + +@add_start_docstrings( + "The {{cookiecutter.uppercase_modelname}} Model with a language modeling head. Can be used for summarization.", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [ + r"model.encoder.embed_tokens.weight", + r"model.decoder.embed_tokens.weight", + ] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TF{{cookiecutter.camelcase_modelname}}Model(config, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the the sake of consistency. 
+ self.final_logits_bias = self.add_weight( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def resize_token_embeddings(self, new_num_tokens): + super().resize_token_embeddings(new_num_tokens=new_num_tokens) + + # {{cookiecutter.uppercase_modelname}} is a special case where the bias has two dimensions + # and not named just `bias` + if new_num_tokens is not None: + num_tokens_to_copy = min(self.final_logits_bias.shape[0], new_num_tokens) + init_bias = tf.zeros((new_num_tokens,)) + init_bias[:num_tokens_to_copy] = self.final_logits_bias.value()[:num_tokens_to_copy] + self.final_logits_bias = self.add_weight( + shape=(1, new_num_tokens), + initializer="zeros", + trainable=False, + name="final_logits_bias", + ) + self.final_logits_bias.assign(init_bias) + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs: Optional[TFBaseModelOutput] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, + **kwargs, + ): + """ + Returns: + + Examples:: + + >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration + >>> import tensorflow as tf + >>> mname = '{{cookiecutter.checkpoint_identifier}}' + >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained(mname) + >>> TXT = "My friends are but they eat too many carbs." 
+ >>> model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained(mname) + >>> batch = tokenizer([TXT], return_tensors='tf') + >>> logits = model(inputs=batch.input_ids).logits + >>> probs = tf.nn.softmax(logits[0]) + >>> # probs[5] is associated with the mask token + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + + if inputs["labels"] is not None: + inputs["use_cache"] = False + if inputs["decoder_input_ids"] is None: + inputs["decoder_input_ids"] = shift_tokens_right( + inputs["labels"], self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + decoder_input_ids=inputs["decoder_input_ids"], + encoder_outputs=inputs["encoder_outputs"], + decoder_attention_mask=inputs["decoder_attention_mask"], + past_key_values=inputs["past_key_values"], + inputs_embeds=inputs["inputs_embeds"], + decoder_inputs_embeds=inputs["decoder_inputs_embeds"], + use_cache=inputs["use_cache"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"] + ) + lm_logits = self.model.shared(outputs[0], mode="linear") + lm_logits = lm_logits + self.final_logits_bias + masked_lm_loss = None if inputs["labels"] is None else self.compute_loss(inputs["labels"], lm_logits) + + if not inputs["return_dict"]: + output = (lm_logits,) + outputs[1:] + return 
((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + encoder_last_hidden_state=outputs.last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs) -> Dict: + assert past is not None and len(past) in {1, 2}, f"past has to be an iterable of length 1,2 got {past}" + if len(past) == 1: + assert isinstance(past[0], tf.Tensor), f"`past[0]` has to be of type `tf.Tensor`, but is {type(past[0])}" + encoder_outputs = TFBaseModelOutput(last_hidden_state=past[0]) + past_key_values = None + else: + assert ( + len(past) == 2 + ), "`past` has to be of length 2 with the encoder_outputs at the first position and past_key_values at the second position." + encoder_outputs, past_key_values = past + if isinstance(encoder_outputs, tuple): + assert isinstance( + encoder_outputs[0], tf.Tensor + ), f"`encoder_outputs[0]` has to be of type `tf.Tensor`, but is {type(encoder_outputs[0])}" + encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs[0]) + elif isinstance(encoder_outputs, tf.Tensor): + encoder_outputs = TFBaseModelOutput(last_hidden_state=encoder_outputs) + assert ( + past_key_values + ), f"decoder cached states must be truthy. got {past_key_values} from the 2nd element of past" + decoder_input_ids = decoder_input_ids[:, -1:] + + assert isinstance( + encoder_outputs, TFBaseModelOutput + ), f"encoder_outputs should be a TFBaseModelOutput, Instead got {type(encoder_outputs)}." + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + if len(past) == 1: + return past + + past_key_values = past[1] + + reordered_past = () + for layer_past_key_values in past_key_values: + reordered_past += ( + tuple(tf.gather(layer_past_key_value, beam_idx) for layer_past_key_value in layer_past_key_values), + ) + return (past[0], reordered_past) + + def get_output_embeddings(self): + return self.model.shared + + def get_encoder(self): + return self.model.encoder + + def compute_loss(self, labels, logits): + """CrossEntropyLoss that ignores pad tokens""" + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, + reduction=tf.keras.losses.Reduction.NONE, + ) + melted_labels = tf.reshape(labels, (-1,)) + active_loss = tf.not_equal(melted_labels, self.config.pad_token_id) + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(melted_labels, active_loss) + return loss_fn(labels, reduced_logits) +{% endif -%} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index a810670a43..063e39fe5f 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -14,6 +14,7 @@ # limitations under the License. """ PyTorch {{cookiecutter.modelname}} model. 
""" +{% if cookiecutter.is_encoder_decoder_model == "False" %} import math @@ -1391,3 +1392,1359 @@ class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.ca hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) +{% else %} +import random +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import logging +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config" +_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer" + + +{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "{{cookiecutter.checkpoint_identifier}}", + # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}} +] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." 
+ # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask( + mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None +): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +def {{cookiecutter.camelcase_modelname}}LayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True): + if torch.cuda.is_available(): + try: + from apex.normalization import FusedLayerNorm + + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + except ImportError: + pass + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. 
class {{cookiecutter.camelcase_modelname}}Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    The same module serves as self-attention and as cross-attention: ``forward`` switches to
    cross-attention whenever ``key_value_states`` is passed. With ``is_decoder=True`` the
    key/value states used in the call are returned so the caller can cache them.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        """
        Args:
            embed_dim: size of the input/output features; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            dropout: dropout probability applied to the attention probabilities in ``forward``.
            is_decoder: when ``True``, ``forward`` returns ``(key_states, value_states)`` as its third output.
            bias: passed as ``bias`` to the four projection layers.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})."
        # queries are multiplied by 1/sqrt(head_dim) before the dot product
        self.scaling = self.head_dim ** -0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Split the feature axis into heads: ``(bsz, seq_len, embed_dim)`` -> ``(bsz, num_heads, seq_len, head_dim)``."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel

        Args:
            hidden_states: query input of shape ``(bsz, tgt_len, embed_dim)``.
            key_value_states: if given, keys/values are projected from it (cross-attention);
                otherwise they are projected from ``hidden_states`` (self-attention).
            past_key_value: cached ``(key_states, value_states)``. Reused as-is for cross-attention,
                concatenated with the new states along axis 2 for self-attention.
            attention_mask: additive mask that must have size ``(bsz, 1, tgt_len, src_len)``.
            output_attentions: whether to also return the attention weights.

        Returns:
            ``(attn_output, attn_weights_reshaped, past_key_value)`` where ``attn_output`` has shape
            ``(bsz, tgt_len, embed_dim)``, ``attn_weights_reshaped`` has shape
            ``(bsz, num_heads, tgt_len, src_len)`` or is ``None`` when ``output_attentions`` is false,
            and ``past_key_value`` is the new ``(key_states, value_states)`` when ``is_decoder`` is set
            (otherwise the value that was passed in).
        """

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # fold the head axis into the batch axis so a single bmm handles all heads
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        assert attn_weights.size() == (
            bsz * self.num_heads,
            tgt_len,
            src_len,
        ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"

        if attention_mask is not None:
            assert attention_mask.size() == (
                bsz,
                1,
                tgt_len,
                src_len,
            ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
            # un-fold the head axis so the (bsz, 1, tgt_len, src_len) mask broadcasts over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = F.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        # NOTE(review): the message below prints a 4-tuple (bsz, num_heads, tgt_len, head_dim) while the
        # check compares against the 3-tuple (bsz * num_heads, tgt_len, head_dim) - consider aligning them.
        assert attn_output.size() == (
            bsz * self.num_heads,
            tgt_len,
            self.head_dim,
        ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"

        # merge the heads back: (bsz * num_heads, tgt_len, head_dim) -> (bsz, tgt_len, embed_dim)
        attn_output = (
            attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
            .transpose(1, 2)
            .reshape(bsz, tgt_len, embed_dim)
        )

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value
class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
    """One decoder layer: self-attention, optional cross-attention, then a two-layer feed-forward block.

    Each sub-block applies dropout, adds the residual and then applies its layer norm.
    """

    def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = {{cookiecutter.camelcase_modelname}}Attention(
            embed_dim=self.embed_dim,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
        # cross-attention over the encoder output
        self.encoder_attn = {{cookiecutter.camelcase_modelname}}Attention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape
                `(batch, seq_len, embed_dim)`. When :obj:`None`, the cross-attention block is skipped.
            encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states; the
                first two entries are used for self-attention, the last two for cross-attention.
            output_attentions (:obj:`bool`): Whether the base model outputs attentions.
                This requires the attentions tensor to be reshaped in this function.

        Returns:
            :obj:`tuple`: ``(hidden_states, self_attn_weights, present_key_value, cross_attn_weights)``.
            ``present_key_value`` holds the self-attention key/value states, followed by the cross-attention
            key/value states when ``encoder_hidden_states`` was given. ``cross_attn_weights`` is :obj:`None`
            when the cross-attention block was skipped.
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        return (
            hidden_states,
            self_attn_weights,
            present_key_value,
            cross_attn_weights,
        )
:class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +{{cookiecutter.uppercase_modelname}}_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}Config + + >>> model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) +""" + +{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + + Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the :obj:`input_ids` to the right, following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_{{cookiecutter.lowercase_modelname}}._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. 
+ past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_modelname}}PreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    :class:`{{cookiecutter.camelcase_modelname}}EncoderLayer`.

    Args:
        config: {{cookiecutter.camelcase_modelname}}Config
        embed_tokens (torch.nn.Embedding): token embedding to use for :obj:`input_ids`; when :obj:`None`, a new
            ``nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)`` is created.
    """

    def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        self.dropout = config.dropout
        # probability of skipping a whole layer during training (see `forward`)
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

        self.embed_positions = {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(
            config.max_position_embeddings,
            embed_dim,
            self.padding_idx,
        )
        self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}EncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = {{cookiecutter.camelcase_modelname}}LayerNorm(embed_dim)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. See
                :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__`
                for details.

                `What are input IDs? <../glossary.html#input-ids>`__
            attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
                Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                `What are attention masks? <../glossary.html#attention-mask>`__
            inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
                Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded
                representation. This is useful if you want more control over how to convert :obj:`input_ids` indices
                into associated vectors than the model's internal embedding lookup matrix.
            output_attentions (:obj:`bool`, `optional`):
                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
                returned tensors for more detail.
            output_hidden_states (:obj:`bool`, `optional`):
                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
                for more detail.
            return_dict (:obj:`bool`, `optional`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.

        Returns:
            A :obj:`BaseModelOutput` when :obj:`return_dict` is true, otherwise a tuple of the non-:obj:`None` values
            among ``(hidden_states, encoder_states, all_attentions)``.

        Raises:
            ValueError: if both or neither of :obj:`input_ids` and :obj:`inputs_embeds` are given.
        """
        # fall back to the config defaults for every flag the caller left unset
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        embed_pos = self.embed_positions(input_shape)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                # record the input of each layer; the final output is appended after the loop
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                # a skipped layer contributes `None` to `all_attentions`
                attn = None
            else:
                hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)

            if output_attentions:
                all_attentions = all_attentions + (attn,)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. 
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # 
create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None and combined_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = combined_attention_mask + _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_decoder_cache = () if use_cache else None + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + 
past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + if use_cache: + next_decoder_cache += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + all_cross_attentions += (layer_cross_attn,) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare {{cookiecutter.modelname}} Model outputting raw hidden-states without any specific head on top.", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config, self.shared) + self.decoder = {{cookiecutter.camelcase_modelname}}Decoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + 
tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn, dec_cross_attn); unset entries are dropped + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, +
output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The {{cookiecutter.modelname}} Model with a language modeling head. Can be used for summarization.", {{cookiecutter.uppercase_modelname}}_START_DOCSTRING +) +class {{cookiecutter.camelcase_modelname}}ForConditionalGeneration({{cookiecutter.camelcase_modelname}}PreTrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config): + super().__init__(config) + self.model = {{cookiecutter.camelcase_modelname}}Model(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, 
:new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings({{cookiecutter.uppercase_modelname}}_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size - 1]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size - 1]``. + + Returns: + + Conditional generation example:: + + >>> from transformers import {{cookiecutter.camelcase_modelname}}Tokenizer, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration + >>> tokenizer = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + >>> TXT = "My friends are <mask> but they eat too many carbs."
+ + >>> model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def 
prepare_inputs_for_generation( + self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings( + """ + {{cookiecutter.camelcase_modelname}} model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, **kwargs): + super().__init__(config, **kwargs) + self.model = {{cookiecutter.camelcase_modelname}}Model(config) + self.classification_head = {{cookiecutter.camelcase_modelname}}ClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + 
input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of <eos> tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return
Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + {{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = {{cookiecutter.camelcase_modelname}}Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for 
computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = 
loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) +{% endif -%} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 2789377dea..0a8f983eb2 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+{% if cookiecutter.is_encoder_decoder_model == "False" %} import unittest @@ -318,3 +319,272 @@ class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCa ] ) tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4) + +{% else %} +import unittest + +from transformers import {{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Tokenizer, is_tf_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model + + +@require_tf +class TF{{cookiecutter.camelcase_modelname}}ModelTester: + config_cls = {{cookiecutter.camelcase_modelname}}Config + config_updates = {} + hidden_act = "gelu" + + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = 
bos_token_id + + def prepare_config_and_inputs_for_common(self): + input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size) + eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1) + input_ids = tf.concat([input_ids, eos_tensor], axis=1) + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = self.config_cls( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, + **self.config_updates, + ) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = TF{{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder() + input_ids = inputs_dict["input_ids"] + + input_ids = input_ids[:1, :] + attention_mask = inputs_dict["attention_mask"][:1, :] + self.batch_size = 1 + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + past_key_values = past_key_values[1] + + # create hypothetical next token and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8) + + # append to next input_ids and + next_input_ids = tf.concat([input_ids, next_tokens], axis=-1) +
next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1) + + output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0] + + self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1]) + + # select random slice + random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1])) + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx] + output_from_past_slice = output_from_past[:, :, random_slice_idx] + + # test that outputs are equal for slice + tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3) + + +def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8) + if decoder_attention_mask is None: + decoder_attention_mask = tf.cast(tf.math.not_equal(decoder_input_ids, config.pad_token_id), tf.int8) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + } + + +@require_tf +class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model) if is_tf_available() else () + all_generative_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_tf_available() else () + is_encoder_decoder = True + test_pruning = False + + def setUp(self): + self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self) + self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config) + + def test_config(self): + 
self.config_tester.run_common_tests() + + def test_decoder_model_past_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_model_common_attributes(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer) + x = model.get_output_layer_with_bias() + assert x is None + name = model.get_prefix_bias_name() + assert name is None + + +def _assert_tensors_equal(a, b, atol=1e-12, prefix=""): + """If tensors not close, or a and b aren't both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + # assert_near raises on mismatch and returns None in eager mode, so its return value must not be tested + tf.debugging.assert_near(a, b, atol=atol) + return True + except Exception: + msg = "{} != {}".format(a, b) + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return tf.constant(tok_lst, dtype=tf.int32) + + +TOLERANCE = 1e-4 + + +@slow +@require_sentencepiece +@require_tokenizers +@require_tf +class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase): + def test_inference_no_head(self): + model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + # change to intended input here + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 11, 1024) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.constant( + [[0.7144, 0.8143, -1.2813], [0.7144,
0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE) + + def test_inference_with_head(self): + model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + # change to intended input here + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids) + output = model(**inputs_dict)[0] + expected_shape = (1, 11, 1024) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = tf.constant( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], + ) + tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE) + + def test_seq_to_seq_generation(self): + hf = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + + batch_input = [ + # string 1, + # string 2, + # string 3, + # string 4, + ] + + # The below article tests that we don't add any hypotheses outside of the top n_beams + dct = tok.batch_encode_plus( + batch_input, + max_length=512, + padding="max_length", + truncation_strategy="only_first", + truncation=True, + return_tensors="tf", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + num_beams=2, + ) + + EXPECTED = [ + # here expected 1, + # here expected 2, + # here expected 3, + # here expected 4, + ] + + generated = tok.batch_decode( + hypotheses_batch.numpy().tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True + )
+ assert generated == EXPECTED +{%- endif %} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py index d7cfeb3598..8f6ec18edc 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py @@ -15,6 +15,7 @@ """ Testing suite for the PyTorch {{cookiecutter.modelname}} model. """ +{% if cookiecutter.is_encoder_decoder_model == "False" -%} import unittest from tests.test_modeling_common import floats_tensor @@ -406,3 +407,395 @@ class {{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase ) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + +{% else -%} +import copy +import tempfile +import unittest + +import timeout_decorator # noqa + +from transformers import is_torch_available +from transformers.file_utils import cached_property +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_generation_utils import GenerationTesterMixin +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + {{cookiecutter.camelcase_modelname}}Config, + {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}Model, + {{cookiecutter.camelcase_modelname}}Tokenizer, + ) + from 
transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.camelcase_modelname}}Decoder, + {{cookiecutter.camelcase_modelname}}Encoder, + ) + + +def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict( + config, + input_ids, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, +): + if attention_mask is None: + attention_mask = input_ids.ne(config.pad_token_id) + if decoder_attention_mask is None: + decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id) + return { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + } + + +@require_torch +class {{cookiecutter.camelcase_modelname}}ModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_ids =
ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp( + 3, + ) + input_ids[:, -1] = self.eos_token_id # Eos Token + + decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = {{cookiecutter.camelcase_modelname}}Config( + vocab_size=self.vocab_size, + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + eos_token_id=self.eos_token_id, + bos_token_id=self.bos_token_id, + pad_token_id=self.pad_token_id, + ) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids) + return config, inputs_dict + + def prepare_config_and_inputs_for_common(self): + config, inputs_dict = self.prepare_config_and_inputs() + return config, inputs_dict + + def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict): + model = {{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder().to(torch_device).eval() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + + # first forward pass + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) + + output, past_key_values = outputs.to_tuple() + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_attn_mask = ids_tensor((self.batch_size, 3), 2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1) + + output_from_no_past = model(next_input_ids, 
attention_mask=next_attention_mask)["last_hidden_state"] + output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2)) + + def check_encoder_decoder_model_standalone(self, config, inputs_dict): + model = {{cookiecutter.camelcase_modelname}}Model(config=config).to(torch_device).eval() + outputs = model(**inputs_dict) + + encoder_last_hidden_state = outputs.encoder_last_hidden_state + last_hidden_state = outputs.last_hidden_state + + with tempfile.TemporaryDirectory() as tmpdirname: + encoder = model.get_encoder() + encoder.save_pretrained(tmpdirname) + encoder = {{cookiecutter.camelcase_modelname}}Encoder.from_pretrained(tmpdirname).to(torch_device) + + encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[ + 0 + ] + + self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3) + + with tempfile.TemporaryDirectory() as tmpdirname: + decoder = model.get_decoder() + decoder.save_pretrained(tmpdirname) + decoder = {{cookiecutter.camelcase_modelname}}Decoder.from_pretrained(tmpdirname).to(torch_device) + + last_hidden_state_2 = decoder( + input_ids=inputs_dict["decoder_input_ids"], + attention_mask=inputs_dict["decoder_attention_mask"], + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=inputs_dict["attention_mask"], + )[0] + + self.parent.assertTrue((last_hidden_state_2 - 
last_hidden_state).abs().max().item() < 1e-3) + + +@require_torch +class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + all_model_classes = ( + ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForSequenceClassification, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering) + if is_torch_available() + else () + ) + all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_torch_available() else () + is_encoder_decoder = True + test_pruning = False + test_head_masking = False + test_missing_keys = False + + def setUp(self): + self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self) + self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_initialization_more(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + model = {{cookiecutter.camelcase_modelname}}Model(config) + model.to(torch_device) + model.eval() + # test init + self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item()) + + def _check_var(module): + """Check that we initialized various parameters from N(0, config.init_std).""" + self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2) + + _check_var(model.encoder.embed_tokens) + _check_var(model.encoder.layers[0].self_attn.k_proj) + _check_var(model.encoder.layers[0].fc1) + _check_var(model.encoder.embed_positions) + + def test_save_load_strict(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs() + for model_class in self.all_model_classes: + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model2, info = model_class.from_pretrained(tmpdirname, 
output_loading_info=True) + self.assertEqual(info["missing_keys"], []) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_encoder_decoder_model_standalone(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() + self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) + + # {{cookiecutter.camelcase_modelname}}ForSequenceClassification does not support inputs_embeds + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering): + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + + if not self.is_encoder_decoder: + input_ids = inputs["input_ids"] + del inputs["input_ids"] + else: + encoder_input_ids = inputs["input_ids"] + decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + del inputs["input_ids"] + inputs.pop("decoder_input_ids", None) + + wte = model.get_input_embeddings() + if not self.is_encoder_decoder: + inputs["inputs_embeds"] = wte(input_ids) + else: + inputs["inputs_embeds"] = wte(encoder_input_ids) + inputs["decoder_inputs_embeds"] = wte(decoder_input_ids) + + with torch.no_grad(): + model(**inputs)[0] + + def test_generate_fp16(self): + config, input_dict = self.model_tester.prepare_config_and_inputs() + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration(config).eval().to(torch_device) + if torch_device == "cuda": + model.half() + model.generate(input_ids, 
attention_mask=attention_mask) + model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) + + +def assert_tensors_close(a, b, atol=1e-12, prefix=""): + """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error.""" + if a is None and b is None: + return True + try: + if torch.allclose(a, b, atol=atol): + return True + raise + except Exception: + pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item() + if a.numel() > 100: + msg = f"tensor values are {pct_different:.1%} percent different." + else: + msg = f"{a} != {b}" + if prefix: + msg = prefix + ": " + msg + raise AssertionError(msg) + + +def _long_tensor(tok_lst): + return torch.tensor(tok_lst, dtype=torch.long, device=torch_device) + + +TOLERANCE = 1e-4 + + +@require_torch +@require_sentencepiece +@require_tokenizers +@slow +class {{cookiecutter.camelcase_modelname}}ModelIntegrationTests(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + + def test_inference_no_head(self): + model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device) + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]]) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, 1024)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], 
expected_slice, atol=TOLERANCE)) + + def test_inference_head(self): + model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device) + + # change to intended input + input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids) + with torch.no_grad(): + output = model(**inputs_dict)[0] + expected_shape = torch.Size((1, 11, model.config.vocab_size)) + self.assertEqual(output.shape, expected_shape) + # change to expected output here + expected_slice = torch.tensor( + [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE)) + + def test_seq_to_seq_generation(self): + hf = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device) + tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}') + + batch_input = [ + # string 1, + # string 2, + # string 3, + # string 4, + ] + + # The below article tests that we don't add any hypotheses outside of the top n_beams + dct = tok.batch_encode_plus( + batch_input, + max_length=512, + padding="max_length", + truncation_strategy="only_first", + truncation=True, + return_tensors="pt", + ) + + hypotheses_batch = hf.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=2, + ) + + EXPECTED = [ + # here expected 1, + # here expected 2, + # here expected 3, + # here expected 4, + ] + + generated = tok.batch_decode( + hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, 
skip_special_tokens=True + ) + assert generated == EXPECTED +{% endif -%} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py index fbd2044328..1ff23d30c2 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py @@ -28,6 +28,7 @@ # To replace in: "src/transformers/__init__.py" # Below: "if is_torch_available():" if generating PyTorch # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" %} from .models.{{cookiecutter.lowercase_modelname}} import ( {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, {{cookiecutter.camelcase_modelname}}ForMaskedLM, @@ -41,10 +42,20 @@ {{cookiecutter.camelcase_modelname}}PreTrainedModel, load_tf_weights_in_{{cookiecutter.lowercase_modelname}}, ) +{% else %} + from .models.{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, + {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}Model, + ) +{% endif -%} # End. 
# Below: "if is_tf_available():" if generating TensorFlow # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" %} from .models.{{cookiecutter.lowercase_modelname}} import ( TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM, @@ -57,11 +68,19 @@ TF{{cookiecutter.camelcase_modelname}}Model, TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, ) +{% else %} + from .models.{{cookiecutter.lowercase_modelname}} import ( + TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + TF{{cookiecutter.camelcase_modelname}}Model, + TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, + ) +{% endif -%} # End. # Below: "if is_tokenizers_available():" # Replace with: - from models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast + from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast +# End. # Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" # Replace with: @@ -101,6 +120,7 @@ from ..{{cookiecutter.lowercase_modelname}}.configuration_{{cookiecutter.lowerca # Below: "# Add modeling imports here" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import ( {{cookiecutter.camelcase_modelname}}ForMaskedLM, {{cookiecutter.camelcase_modelname}}ForCausalLM, @@ -110,6 +130,14 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo {{cookiecutter.camelcase_modelname}}ForTokenClassification, {{cookiecutter.camelcase_modelname}}Model, ) +{% else -%} +from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + 
{{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}Model, +) +{% endif -%} # End. # Below: "# Base model mapping" @@ -119,17 +147,27 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo # Below: "# Model with LM heads mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM), +{% else %} + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration), +{% endif -%} # End. # Below: "# Model for Causal LM mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForCausalLM), +{% else -%} +{% endif -%} # End. # Below: "# Model for Masked LM mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM), +{% else -%} +{% endif -%} # End. # Below: "# Model for Sequence Classification mapping" @@ -144,14 +182,27 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo # Below: "# Model for Token Classification mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForTokenClassification), +{% else -%} +{% endif -%} # End. # Below: "# Model for Multiple Choice mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMultipleChoice), +{% else -%} +{% endif -%} # End. 
+# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} +{% else %} + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration), +{% endif -%} +# End. # To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow # Below: "from .configuration_auto import (" @@ -161,6 +212,7 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo # Below: "# Add modeling imports here" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase_modelname}} import ( TF{{cookiecutter.camelcase_modelname}}ForMaskedLM, TF{{cookiecutter.camelcase_modelname}}ForCausalLM, @@ -170,6 +222,12 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase TF{{cookiecutter.camelcase_modelname}}ForTokenClassification, TF{{cookiecutter.camelcase_modelname}}Model, ) +{% else -%} +from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase_modelname}} import ( + TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, + TF{{cookiecutter.camelcase_modelname}}Model, +) +{% endif -%} # End. # Below: "# Base model mapping" @@ -179,35 +237,65 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase # Below: "# Model with LM heads mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM), +{% else %} + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration), +{% endif -%} # End. 
# Below: "# Model for Causal LM mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForCausalLM), +{% else -%} +{% endif -%} # End. # Below: "# Model for Masked LM mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM), +{% else -%} +{% endif -%} # End. # Below: "# Model for Sequence Classification mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification), +{% else -%} +{% endif -%} # End. # Below: "# Model for Question Answering mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering), +{% else -%} +{% endif -%} # End. # Below: "# Model for Token Classification mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForTokenClassification), +{% else -%} +{% endif -%} # End. # Below: "# Model for Multiple Choice mapping" # Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice), +{% else -%} +{% endif -%} +# End. + +# Below: "# Model for Seq2Seq Causal LM mapping" +# Replace with: +{% if cookiecutter.is_encoder_decoder_model == "False" -%} +{% else %} + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration), +{% endif -%} # End. 
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py index 295a9631f5..31ba6bd964 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py @@ -56,6 +56,47 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer + +{%- elif cookiecutter.tokenizer_type == "Based on BART" %} +from ...utils import logging +from ..bart.tokenization_bart_fast import BartTokenizerFast +from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer + + +logger = logging.get_logger(__name__) + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json", + }, + "merges_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt", + }, + "tokenizer_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "{{cookiecutter.checkpoint_identifier}}": 1024, +} + + +class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast): + r""" + Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` 
library). + + :class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs + end-to-end tokenization: byte-level Byte-Pair-Encoding. + + Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer + {%- elif cookiecutter.tokenizer_type == "Standalone" %} from typing import List, Optional diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py index 476c8df026..23374f4b68 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -54,6 +54,45 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + +{%- elif cookiecutter.tokenizer_type == "Based on BART" %} +from ...utils import logging +from ..bart.tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json", + }, + "merges_file": { + "{{cookiecutter.checkpoint_identifier}}":
"https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt", + }, + "tokenizer_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "{{cookiecutter.checkpoint_identifier}}": 1024, +} + + +class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer): + """ + Construct a {{cookiecutter.modelname}} tokenizer. + + :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end + tokenization: byte-level Byte-Pair-Encoding. + + Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning + parameters. + """ + + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + {%- elif cookiecutter.tokenizer_type == "Standalone" %} from typing import List, Optional @@ -289,5 +328,4 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast) return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - {% endif %} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst index 61f2601ba3..02bb76ebbe 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst @@ -57,7 +57,7 @@ Tips: ..
autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Model :members: forward - +{% if cookiecutter.is_encoder_decoder_model == "False" %} {{cookiecutter.camelcase_modelname}}ForCausalLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -83,7 +83,7 @@ Tips: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice - :members: + :members: forward {{cookiecutter.camelcase_modelname}}ForTokenClassification @@ -99,6 +99,29 @@ Tips: .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering :members: forward +{%- else %} +{{cookiecutter.camelcase_modelname}}ForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForConditionalGeneration + :members: forward + + +{{cookiecutter.camelcase_modelname}}ForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification + :members: forward + + +{{cookiecutter.camelcase_modelname}}ForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering + :members: forward + + +{% endif -%} {% endif -%} {% if "TensorFlow" in cookiecutter.generate_tensorflow_and_pytorch -%} @@ -108,7 +131,7 @@ TF{{cookiecutter.camelcase_modelname}}Model .. 
autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}Model :members: call - +{% if cookiecutter.is_encoder_decoder_model == "False" %} TF{{cookiecutter.camelcase_modelname}}ForMaskedLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -120,7 +143,7 @@ TF{{cookiecutter.camelcase_modelname}}ForCausalLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForCausalLM - :members: forward + :members: call TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification @@ -151,4 +174,11 @@ TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering :members: call +{%- else %} +TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration + :members: call +{% endif -%} {% endif -%} diff --git a/templates/adding_a_new_model/cookiecutter.json b/templates/adding_a_new_model/cookiecutter.json index a5df424a2d..c3e07e6c3f 100644 --- a/templates/adding_a_new_model/cookiecutter.json +++ b/templates/adding_a_new_model/cookiecutter.json @@ -5,6 +5,7 @@ "camelcase_modelname": "BrandNewBert", "authors": "The HuggingFace Team", "checkpoint_identifier": "brand-new-bert-base-cased", - "tokenizer_type": ["Based on BERT", "Standalone"], - "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"] -} \ No newline at end of file + "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"], + "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"], + "is_encoder_decoder_model": ["True", "False"] +} diff --git a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json 
b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json index 4ff2722e01..8618cff452 100644 --- a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json +++ b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json @@ -6,5 +6,6 @@ "authors": "The HuggingFace Team", "checkpoint_identifier": "brand-new-bert-base-cased", "tokenizer_type": "Based on BERT", - "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow" + "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", + "is_encoder_decoder_model": "False" } diff --git a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json index b740bd4a20..b30d69c041 100644 --- a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json +++ b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json @@ -6,5 +6,6 @@ "authors": "The HuggingFace Team", "checkpoint_identifier": "brand-new-bert-base-cased", "tokenizer_type": "Based on BERT", - "generate_tensorflow_and_pytorch": "PyTorch" + "generate_tensorflow_and_pytorch": "PyTorch", + "is_encoder_decoder_model": "False" } diff --git a/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json b/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json new file mode 100644 index 0000000000..f297820b2d --- /dev/null +++ b/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json @@ -0,0 +1,11 @@ +{ + "modelname": "NewENCDEC", + "uppercase_modelname": "NEW_ENC_DEC", + "lowercase_modelname": "new_enc_dec", + "camelcase_modelname": "NewEncDec", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "new-enc-dec-base", + "tokenizer_type": "Based on BART", + "generate_tensorflow_and_pytorch": "PyTorch", + "is_encoder_decoder_model": "True" +} diff --git a/templates/adding_a_new_model/tests/standalone.json b/templates/adding_a_new_model/tests/standalone.json index e0983eadb6..80b8cfd84c 100644 --- 
a/templates/adding_a_new_model/tests/standalone.json +++ b/templates/adding_a_new_model/tests/standalone.json @@ -6,5 +6,6 @@ "authors": "The HuggingFace Team", "checkpoint_identifier": "bi-brand-new-bert-base-cased", "tokenizer_type": "Standalone", - "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow" + "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow", + "is_encoder_decoder_model": "False" } diff --git a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json index 077a48d774..d4f9b0df8a 100644 --- a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json +++ b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json @@ -6,5 +6,6 @@ "authors": "The HuggingFace Team", "checkpoint_identifier": "brand-new-bert-base-cased", "tokenizer_type": "Based on BERT", - "generate_tensorflow_and_pytorch": "TensorFlow" + "generate_tensorflow_and_pytorch": "TensorFlow", + "is_encoder_decoder_model": "False" } diff --git a/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json b/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json new file mode 100644 index 0000000000..c98bc6b4b6 --- /dev/null +++ b/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json @@ -0,0 +1,11 @@ +{ + "modelname": "NewTFENCDEC", + "uppercase_modelname": "NEW_TF_ENC_DEC", + "lowercase_modelname": "new_tf_enc_dec", + "camelcase_modelname": "NewTFEncDec", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "new-tf-enc-dec-base", + "tokenizer_type": "Based on BART", + "generate_tensorflow_and_pytorch": "TensorFlow", + "is_encoder_decoder_model": "True" +} diff --git a/tests/test_modeling_tf_bart.py b/tests/test_modeling_tf_bart.py index d4dd5307db..5e3377db94 100644 --- a/tests/test_modeling_tf_bart.py +++ b/tests/test_modeling_tf_bart.py @@ -94,7 +94,7 @@ class TFBartModelTester: self.batch_size = 1 # first forward pass - outputs = model(input_ids, 
use_cache=True) + outputs = model(input_ids, attention_mask=attention_mask, use_cache=True) output, past_key_values = outputs.to_tuple() past_key_values = past_key_values[1]