[TFBart] Split TF-Bart (#9497)

* make templates ready

* make add_new_model_command_ready

* finish tf bart

* prepare tf mbart

* finish tf bart

* add tf mbart

* add marian

* prep pegasus

* add tf pegasus

* push blenderbot tf

* add blenderbot

* add blenderbot small

* clean-up

* make fix copy

* define blend bot tok

* fix

* up

* make style

* add to docs

* add copy statements

* overwrite changes

* improve

* fix docs

* finish

* fix last slow test

* fix missing git conflict line

* fix blenderbot

* up

* fix blenderbot small

* load changes

* finish copied from

* upload fix
This commit is contained in:
Patrick von Platen
2021-01-12 02:06:32 +01:00
committed by GitHub
parent 0ecbb69806
commit 7f28613213
39 changed files with 7883 additions and 605 deletions

View File

@@ -868,7 +868,10 @@ if is_tf_available():
"TFBertPreTrainedModel",
]
)
_import_structure["models.blenderbot"].append("TFBlenderbotForConditionalGeneration")
_import_structure["models.blenderbot"].extend(["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel"])
_import_structure["models.blenderbot_small"].extend(
["TFBlenderbotSmallForConditionalGeneration", "TFBlenderbotSmallModel"]
)
_import_structure["models.camembert"].extend(
[
"TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -986,8 +989,8 @@ if is_tf_available():
"TFLxmertVisualFeatureEncoder",
]
)
_import_structure["models.marian"].append("TFMarianMTModel")
_import_structure["models.mbart"].append("TFMBartForConditionalGeneration")
_import_structure["models.marian"].extend(["TFMarianMTModel", "TFMarianModel"])
_import_structure["models.mbart"].extend(["TFMBartForConditionalGeneration", "TFMBartModel"])
_import_structure["models.mobilebert"].extend(
[
"TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1028,7 +1031,7 @@ if is_tf_available():
"TFOpenAIGPTPreTrainedModel",
]
)
_import_structure["models.pegasus"].append("TFPegasusForConditionalGeneration")
_import_structure["models.pegasus"].extend(["TFPegasusForConditionalGeneration", "TFPegasusModel"])
_import_structure["models.roberta"].extend(
[
"TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1855,7 +1858,8 @@ if TYPE_CHECKING:
TFBertModel,
TFBertPreTrainedModel,
)
from .models.blenderbot import TFBlenderbotForConditionalGeneration
from .models.blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel
from .models.blenderbot_small import TFBlenderbotSmallForConditionalGeneration, TFBlenderbotSmallModel
from .models.camembert import (
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFCamembertForMaskedLM,
@@ -1953,8 +1957,8 @@ if TYPE_CHECKING:
TFLxmertPreTrainedModel,
TFLxmertVisualFeatureEncoder,
)
from .models.marian import TFMarianMTModel
from .models.mbart import TFMBartForConditionalGeneration
from .models.marian import TFMarian, TFMarianMTModel
from .models.mbart import TFMBartForConditionalGeneration, TFMBartModel
from .models.mobilebert import (
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFMobileBertForMaskedLM,
@@ -1989,7 +1993,7 @@ if TYPE_CHECKING:
TFOpenAIGPTModel,
TFOpenAIGPTPreTrainedModel,
)
from .models.pegasus import TFPegasusForConditionalGeneration
from .models.pegasus import TFPegasusForConditionalGeneration, TFPegasusModel
from .models.roberta import (
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
TFRobertaForMaskedLM,

View File

@@ -44,7 +44,11 @@ from ..bert.modeling_tf_bert import (
TFBertLMHeadModel,
TFBertModel,
)
from ..blenderbot.modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
from ..blenderbot.modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel
from ..blenderbot_small.modeling_tf_blenderbot_small import (
TFBlenderbotSmallForConditionalGeneration,
TFBlenderbotSmallModel,
)
from ..camembert.modeling_tf_camembert import (
TFCamembertForMaskedLM,
TFCamembertForMultipleChoice,
@@ -100,8 +104,8 @@ from ..longformer.modeling_tf_longformer import (
TFLongformerModel,
)
from ..lxmert.modeling_tf_lxmert import TFLxmertForPreTraining, TFLxmertModel
from ..marian.modeling_tf_marian import TFMarianMTModel
from ..mbart.modeling_tf_mbart import TFMBartForConditionalGeneration
from ..marian.modeling_tf_marian import TFMarianModel, TFMarianMTModel
from ..mbart.modeling_tf_mbart import TFMBartForConditionalGeneration, TFMBartModel
from ..mobilebert.modeling_tf_mobilebert import (
TFMobileBertForMaskedLM,
TFMobileBertForMultipleChoice,
@@ -122,7 +126,7 @@ from ..mpnet.modeling_tf_mpnet import (
)
from ..mt5.modeling_tf_mt5 import TFMT5ForConditionalGeneration, TFMT5Model
from ..openai.modeling_tf_openai import TFOpenAIGPTForSequenceClassification, TFOpenAIGPTLMHeadModel, TFOpenAIGPTModel
from ..pegasus.modeling_tf_pegasus import TFPegasusForConditionalGeneration
from ..pegasus.modeling_tf_pegasus import TFPegasusForConditionalGeneration, TFPegasusModel
from ..roberta.modeling_tf_roberta import (
TFRobertaForMaskedLM,
TFRobertaForMultipleChoice,
@@ -167,6 +171,7 @@ from .configuration_auto import (
BartConfig,
BertConfig,
BlenderbotConfig,
BlenderbotSmallConfig,
CamembertConfig,
CTRLConfig,
DistilBertConfig,
@@ -225,6 +230,12 @@ TF_MODEL_MAPPING = OrderedDict(
(FunnelConfig, TFFunnelModel),
(DPRConfig, TFDPRQuestionEncoder),
(MPNetConfig, TFMPNetModel),
(BartConfig, TFBartModel),
(MBartConfig, TFMBartModel),
(MarianConfig, TFMarianModel),
(PegasusConfig, TFPegasusModel),
(BlenderbotConfig, TFBlenderbotModel),
(BlenderbotSmallConfig, TFBlenderbotSmallModel),
]
)
@@ -328,6 +339,7 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
(MBartConfig, TFMBartForConditionalGeneration),
(PegasusConfig, TFPegasusForConditionalGeneration),
(BlenderbotConfig, TFBlenderbotForConditionalGeneration),
(BlenderbotSmallConfig, TFBlenderbotSmallForConditionalGeneration),
(BartConfig, TFBartForConditionalGeneration),
]
)

View File

@@ -24,6 +24,7 @@ from ..bart.tokenization_bart import BartTokenizer
from ..bert.tokenization_bert import BertTokenizer
from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer
from ..bertweet.tokenization_bertweet import BertweetTokenizer
from ..blenderbot.tokenization_blenderbot import BlenderbotTokenizer
from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer
from ..ctrl.tokenization_ctrl import CTRLTokenizer
from ..deberta.tokenization_deberta import DebertaTokenizer
@@ -58,6 +59,7 @@ from .configuration_auto import (
BertConfig,
BertGenerationConfig,
BlenderbotConfig,
BlenderbotSmallConfig,
CamembertConfig,
CTRLConfig,
DebertaConfig,
@@ -201,7 +203,8 @@ TOKENIZER_MAPPING = OrderedDict(
(MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
(XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
(MarianConfig, (MarianTokenizer, None)),
(BlenderbotConfig, (BlenderbotSmallTokenizer, None)),
(BlenderbotSmallConfig, (BlenderbotSmallTokenizer, None)),
(BlenderbotConfig, (BlenderbotTokenizer, None)),
(LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
(BartConfig, (BartTokenizer, BartTokenizerFast)),
(LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),

View File

@@ -170,16 +170,6 @@ class BartConfig(PretrainedConfig):
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
self.force_bos_token_to_be_generated = force_bos_token_to_be_generated # only relevant for CNN
# IMPORTANT
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
self.extra_pos_embeddings = 2
self.normalize_before = False
self.add_final_layer_norm = False
self.do_blenderbot_90_layernorm = False
self.normalize_embedding = True
self.static_position_embeddings = False
self.add_bias_logits = False
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads

View File

@@ -1,5 +1,5 @@
# coding=utf-8
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,19 +12,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF BART model, ported from the fairseq repo."""
""" TF 2.0 Bart model. """
import math
import random
import warnings
from typing import Dict, Optional, Tuple, Union
import numpy as np
import tensorflow as tf
from ...activations_tf import ACT2FN
from ...activations_tf import get_tf_activation
from ...file_utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
replace_return_docstrings,
@@ -55,13 +54,14 @@ logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BartConfig"
_TOKENIZER_FOR_DOC = "BartTokenizer"
LARGE_NEGATIVE = -1e8
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, eos_token_id: int):
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
shifted_input_ids = tf.cast(input_ids, tf.int32)
shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1)
start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), eos_token_id)
start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id)
shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1)
# replace possible -100 values in labels by `pad_token_id`
shifted_input_ids = tf.where(
@@ -94,7 +94,7 @@ def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: i
return tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length))
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None, past_key_values_length: int = 0):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
@@ -108,18 +108,15 @@ def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset, **kwargs):
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, **kwargs):
assert padding_idx is not None, "padding_idx cannot be None"
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models dont have this hack
self.offset = offset
assert padding_idx is not None, "padding_idx cannot be None"
num_embeddings += offset
super().__init__(num_embeddings, embedding_dim, **kwargs)
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs)
def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
"""Input is expected to be of size [bsz x seqlen]."""
@@ -128,56 +125,7 @@ class TFBartLearnedPositionalEmbedding(TFSharedEmbeddings):
positions = tf.range(
past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range"
)
return super().call(positions + self.offset) # super object is not callable for some reason
class TFBartSinusoidalPositionalEmbedding(tf.keras.layers.Embedding):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, **kwargs):
if embedding_dim % 2 != 0:
raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
super().__init__(
num_positions,
embedding_dim,
**kwargs,
)
def build(self, input_shape: tf.TensorShape):
"""
Build shared token embedding layer Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
super().build(input_shape) # Instantiates self.weight so it can be loaded
weight: np.ndarray = self._init_weight(self.input_dim, self.output_dim)
self.set_weights([weight]) # overwrite self.weight to correct value
@staticmethod
def _init_weight(n_pos: int, dim: int):
"""
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
the 2nd half of the vector. [dim // 2:]
"""
position_enc = np.array(
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
)
# index 0 is all zero
position_enc[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2])
position_enc[:, dim // 2 :] = np.cos(position_enc[:, 1::2])
# convert to tensor
table = tf.convert_to_tensor(position_enc, dtype=tf.float32)
tf.stop_gradient(table)
return table
def call(self, input_shape: tf.TensorShape, past_key_values_length: int = 0):
"""Input is expected to be of size [bsz x seqlen]."""
bsz, seq_len = input_shape[:2]
positions = tf.range(
past_key_values_length, seq_len + past_key_values_length, delta=1, dtype=tf.int32, name="range"
)
return super().call(positions)
return super().call(positions + self.offset)
class TFBartAttention(tf.keras.layers.Layer):
@@ -310,10 +258,9 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
self.self_attn = TFBartAttention(
self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
)
self.normalize_before = config.normalize_before
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.activation_fn = ACT2FN[config.activation_function]
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.fc1 = tf.keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
self.fc2 = tf.keras.layers.Dense(self.embed_dim, name="fc2")
@@ -327,8 +274,6 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
"""
residual = hidden_states
if self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, self_attn_weights, _ = self.self_attn(
hidden_states=hidden_states, attention_mask=attention_mask
)
@@ -339,19 +284,15 @@ class TFBartEncoderLayer(tf.keras.layers.Layer):
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = self.self_attn_layer_norm(hidden_states)
residual = hidden_states
if self.normalize_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
return hidden_states, self_attn_weights
@@ -368,9 +309,8 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
is_decoder=True,
)
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.activation_fn = ACT2FN[config.activation_function]
self.activation_fn = get_tf_activation(config.activation_function)
self.activation_dropout = tf.keras.layers.Dropout(config.activation_dropout)
self.normalize_before = config.normalize_before
self.self_attn_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
self.encoder_attn = TFBartAttention(
@@ -405,8 +345,6 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
past_key_value (:obj:`Tuple(tf.Tensor)`): cached past key and value projection states
"""
residual = hidden_states
if self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
# Self Attention
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
@@ -419,15 +357,12 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = self.self_attn_layer_norm(hidden_states)
# Cross-Attention Block
cross_attn_present_key_value = None
if encoder_hidden_states is not None:
residual = hidden_states
if self.normalize_before:
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
@@ -439,24 +374,19 @@ class TFBartDecoderLayer(tf.keras.layers.Layer):
)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.encoder_attn_layer_norm(hidden_states)
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# add cross-attn to positions 3,4 of present_key_value tuple
present_key_value = present_key_value + cross_attn_present_key_value
# Fully Connected
residual = hidden_states
if self.normalize_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout(hidden_states, training=training)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout(hidden_states, training=training)
hidden_states = residual + hidden_states
if not self.normalize_before:
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.final_layer_norm(hidden_states)
return (
hidden_states,
@@ -472,8 +402,8 @@ class TFBartPretrainedModel(TFPreTrainedModel):
@property
def dummy_inputs(self):
pad_token = 1
input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32)
decoder_input_ids = tf.cast(tf.constant(DUMMY_INPUTS), tf.int32)
input_ids = tf.cast(tf.convert_to_tensor(DUMMY_INPUTS), tf.int32)
decoder_input_ids = tf.cast(tf.convert_to_tensor(DUMMY_INPUTS), tf.int32)
dummy_inputs = {
"decoder_input_ids": decoder_input_ids,
"attention_mask": tf.math.not_equal(input_ids, pad_token),
@@ -520,14 +450,6 @@ class TFBartPretrainedModel(TFPreTrainedModel):
return self.serving_output(output)
class TFPretrainedBartModel(TFBartPretrainedModel):
def __init_subclass__(self):
warnings.warn(
"The class `TFPretrainedBartModel` has been deprecated, please use `TFBartPretrainedModel` instead.",
FutureWarning,
)
BART_START_DOCSTRING = r"""
This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
generic methods the library implements for all its model (such as downloading or saving, resizing the input
@@ -563,6 +485,36 @@ BART_START_DOCSTRING = r"""
model weights.
"""
BART_GENERATION_EXAMPLE = r"""
Summarization example::
>>> from transformers import BartTokenizer, TFBartForConditionalGeneration, BartConfig
>>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='tf')
>>> # Generate Summary
>>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
>>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
Mask filling example::
>>> from transformers import BartTokenizer, TFBartForConditionalGeneration
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
>>> TXT = "My friends are <mask> but they eat too many carbs."
>>> model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large')
>>> input_ids = tokenizer([TXT], return_tensors='tf')['input_ids']
>>> logits = model(input_ids).logits
>>> probs = tf.nn.softmax(logits[0])
>>> # probs[5] is associated with the mask token
"""
BART_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`tf.Tensor` of shape :obj:`({0})`):
@@ -581,8 +533,21 @@ BART_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the input_ids right, following the paper.
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BartTokenizer`. See
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
details.
`What are input IDs? <../glossary.html#input-ids>`__
Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
:obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
:obj:`past_key_values`).
For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
:obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
the right for denoising pre-training following the paper.
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
@@ -603,7 +568,7 @@ BART_INPUTS_DOCSTRING = r"""
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.TFModelOutput` instead of a plain tuple.
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
training (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).
@@ -626,36 +591,19 @@ class TFBartEncoder(tf.keras.layers.Layer):
self.config = config
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.layerdrop = config.encoder_layerdrop
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.embed_tokens = embed_tokens
if config.static_position_embeddings:
self.embed_positions = TFBartSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
else:
self.embed_positions = TFBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
config.extra_pos_embeddings,
name="embed_positions",
)
self.embed_positions = TFBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
name="embed_positions",
)
self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
self.layernorm_embedding = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
if config.normalize_embedding
else tf.keras.layers.Layer()
)
self.layer_norm = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
if config.add_final_layer_norm
else None
)
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@@ -725,11 +673,7 @@ class TFBartEncoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs["inputs_embeds"] is None:
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
else:
inputs["inputs_embeds"] = inputs["inputs_embeds"]
inputs["inputs_embeds"] = inputs["inputs_embeds"] * self.embed_scale
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) * self.embed_scale
embed_pos = self.embed_positions(input_shape)
hidden_states = inputs["inputs_embeds"] + embed_pos
@@ -739,7 +683,9 @@ class TFBartEncoder(tf.keras.layers.Layer):
# check attention mask and invert
if inputs["attention_mask"] is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
inputs["attention_mask"] = _expand_mask(inputs["attention_mask"])
attention_mask = _expand_mask(inputs["attention_mask"])
else:
attention_mask = None
encoder_states = () if inputs["output_hidden_states"] else None
all_attentions = () if inputs["output_attentions"] else None
@@ -754,12 +700,11 @@ class TFBartEncoder(tf.keras.layers.Layer):
if inputs["training"] and (dropout_probability < self.layerdrop): # skip the layer
continue
hidden_states, attn = encoder_layer(hidden_states, inputs["attention_mask"])
hidden_states, attn = encoder_layer(hidden_states, attention_mask)
if inputs["output_attentions"]:
all_attentions += (attn,)
if self.layer_norm:
hidden_states = self.layer_norm(hidden_states)
if inputs["output_hidden_states"]:
encoder_states = encoder_states + (hidden_states,)
@@ -786,36 +731,18 @@ class TFBartDecoder(tf.keras.layers.Layer):
self.config = config
self.padding_idx = config.pad_token_id
self.embed_tokens = embed_tokens
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.layerdrop = config.decoder_layerdrop
if config.static_position_embeddings:
self.embed_positions = TFBartSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
name="embed_positions",
)
else:
self.embed_positions = TFBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
config.extra_pos_embeddings,
name="embed_positions",
)
self.embed_positions = TFBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
name="embed_positions",
)
self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
self.layernorm_embedding = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
if config.normalize_embedding
else tf.keras.layers.Layer()
)
self.layer_norm = (
tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
if config.add_final_layer_norm
else None
)
self.layernorm_embedding = tf.keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding")
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm
def set_embed_tokens(self, embed_tokens):
self.embed_tokens = embed_tokens
@@ -912,16 +839,16 @@ class TFBartDecoder(tf.keras.layers.Layer):
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
past_key_values_length = (
inputs["past_key_values"][0][0].shape[2] if inputs["past_key_values"] is not None else 0
shape_list(inputs["past_key_values"][0][0])[2] if inputs["past_key_values"] is not None else 0
)
# embed positions
positions = self.embed_positions(input_shape, past_key_values_length)
if inputs["inputs_embeds"] is None:
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"])
inputs["inputs_embeds"] = self.embed_tokens(inputs["input_ids"]) * self.embed_scale
hidden_states = inputs["inputs_embeds"] * self.embed_scale
hidden_states = inputs["inputs_embeds"]
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
if input_shape[-1] > 1:
@@ -931,35 +858,16 @@ class TFBartDecoder(tf.keras.layers.Layer):
tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1]
)
if inputs["attention_mask"] is None and inputs["input_ids"] is not None and input_shape[-1] > 1:
inputs["attention_mask"] = tf.cast(
tf.math.not_equal(inputs["input_ids"], self.config.pad_token_id), inputs["input_ids"].dtype
if inputs["attention_mask"] is not None and input_shape[-1] > 1:
combined_attention_mask = combined_attention_mask + _expand_mask(
inputs["attention_mask"], tgt_len=input_shape[-1]
)
inputs["attention_mask"] = tf.concat(
[
tf.ones((input_shape[0], past_key_values_length), dtype=inputs["attention_mask"].dtype),
inputs["attention_mask"],
],
axis=-1,
)
else:
inputs["attention_mask"] = tf.ones(
(input_shape[0], input_shape[1] + past_key_values_length), dtype=tf.int32
)
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = combined_attention_mask + _expand_mask(
inputs["attention_mask"], tgt_len=input_shape[-1]
)
if inputs["encoder_hidden_states"] is not None and inputs["encoder_attention_mask"] is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
inputs["encoder_attention_mask"] = _expand_mask(inputs["encoder_attention_mask"], tgt_len=input_shape[-1])
if self.do_blenderbot_90_layernorm:
hidden_states = self.layernorm_embedding(hidden_states) + positions
else:
hidden_states = self.layernorm_embedding(hidden_states + positions)
hidden_states = self.layernorm_embedding(hidden_states + positions)
hidden_states = self.dropout(hidden_states, training=inputs["training"])
# decoder layers
@@ -991,10 +899,6 @@ class TFBartDecoder(tf.keras.layers.Layer):
if inputs["output_attentions"]:
all_self_attns += (layer_self_attn,)
if self.layer_norm is not None: # same as if config.add_final_layer_norm
hidden_states = self.layer_norm(hidden_states)
# Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
if inputs["output_hidden_states"]:
all_hidden_states += (hidden_states,)
else:
@@ -1002,7 +906,7 @@ class TFBartDecoder(tf.keras.layers.Layer):
all_self_attns = list(all_self_attns) if inputs["output_attentions"] else None
present_key_values = (inputs["encoder_hidden_states"], present_key_values) if inputs["use_cache"] else None
present_key_values = (encoder_hidden_states, present_key_values) if inputs["use_cache"] else None
if not inputs["return_dict"]:
return hidden_states, present_key_values, all_hidden_states, all_self_attns
@@ -1098,7 +1002,7 @@ class TFBartModel(TFBartPretrainedModel):
if inputs["decoder_input_ids"] is None and inputs["input_ids"] is not None:
inputs["decoder_input_ids"] = shift_tokens_right(
inputs["input_ids"], self.config.pad_token_id, self.config.eos_token_id
inputs["input_ids"], self.config.pad_token_id, self.config.decoder_start_token_id
)
if inputs["encoder_outputs"] is None:
@@ -1206,6 +1110,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
@add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BART_GENERATION_EXAMPLE)
def call(
self,
input_ids=None,
@@ -1224,22 +1129,14 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
training=False,
**kwargs,
):
"""
r"""
labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should either be in ``[0, ...,
config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
Returns:
Examples::
# Mask filling only works for bart-large
from transformers import BartTokenizer, TFBartForConditionalGeneration
import tensorflow as tf
mname = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(mname)
TXT = "My friends are <mask> but they eat too many carbs."
model = TFBartForConditionalGeneration.from_pretrained(mname)
batch = tokenizer([TXT], return_tensors='tf')
logits = model(inputs=batch.input_ids).logits
probs = tf.nn.softmax(logits[0])
# probs[5] is associated with the mask token
"""
inputs = input_processing(
func=self.call,
@@ -1265,7 +1162,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
inputs["use_cache"] = False
if inputs["decoder_input_ids"] is None:
inputs["decoder_input_ids"] = shift_tokens_right(
inputs["labels"], self.config.pad_token_id, self.config.eos_token_id
inputs["labels"], self.config.pad_token_id, self.config.decoder_start_token_id
)
outputs = self.model(
@@ -1363,7 +1260,8 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
reordered_past = ()
for layer_past_key_values in past_key_values:
reordered_past += (
tuple(tf.gather(layer_past_key_value, beam_idx) for layer_past_key_value in layer_past_key_values),
tuple(tf.gather(layer_past_key_value, beam_idx) for layer_past_key_value in layer_past_key_values[:2])
+ layer_past_key_values[2:],
)
return (past[0], reordered_past)

View File

@@ -36,7 +36,7 @@ if is_torch_available():
if is_tf_available():
_import_structure["modeling_tf_blenderbot"] = ["TFBlenderbotForConditionalGeneration"]
_import_structure["modeling_tf_blenderbot"] = ["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel"]
if TYPE_CHECKING:
@@ -52,7 +52,7 @@ if TYPE_CHECKING:
)
if is_tf_available():
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel
else:
import importlib

View File

@@ -161,17 +161,6 @@ class BlenderbotConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
# IMPORTANT
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
self.extra_pos_embeddings = 0
self.normalize_before = True
self.add_final_layer_norm = True
self.do_blenderbot_90_layernorm = True
self.normalize_embedding = False
self.static_position_embeddings = False
self.add_bias_logits = False
self.force_bos_token_to_be_generated = False
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads

File diff suppressed because it is too large Load Diff

View File

@@ -17,7 +17,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
from ...file_utils import _BaseLazyModule, is_torch_available
from ...file_utils import _BaseLazyModule, is_tf_available, is_torch_available
_import_structure = {
@@ -33,6 +33,11 @@ if is_torch_available():
"BlenderbotSmallPreTrainedModel",
]
if is_tf_available():
_import_structure["modeling_tf_blenderbot_small"] = [
"TFBlenderbotSmallForConditionalGeneration",
"TFBlenderbotSmallModel",
]
if TYPE_CHECKING:
from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig
@@ -46,6 +51,9 @@ if TYPE_CHECKING:
BlenderbotSmallPreTrainedModel,
)
if is_tf_available():
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration, TFBlenderbotModel
else:
import importlib
import os

View File

@@ -866,6 +866,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel):
all_self_attns = () if output_attentions else None
all_cross_attentions = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
if output_hidden_states:

File diff suppressed because it is too large Load Diff

View File

@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
# "tokenizer_config_file": "tokenizer_config.json",
"tokenizer_config_file": "tokenizer_config.json",
}
@@ -75,13 +75,20 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
"""
vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
vocab_files_names = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config": "tokenizer_config.json",
}
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/vocab.json"
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/blob/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/merges.txt"
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/blob/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/blob/main/tokenizer.json"
},
}
max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}

View File

@@ -1475,7 +1475,7 @@ LED_INPUTS_DOCSTRING = r"""
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`~transformers.file_utils.TFModelOutput` instead of a plain tuple.
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
training (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use the model in training mode (some modules like dropout modules have different
behaviors between training and evaluation).

View File

@@ -42,7 +42,7 @@ if is_torch_available():
]
if is_tf_available():
_import_structure["modeling_tf_marian"] = ["TFMarianMTModel"]
_import_structure["modeling_tf_marian"] = ["TFMarianMTModel", "TFMarianModel"]
if TYPE_CHECKING:
@@ -60,7 +60,7 @@ if TYPE_CHECKING:
)
if is_tf_available():
from .modeling_tf_marian import TFMarianMTModel
from .modeling_tf_marian import TFMarianModel, TFMarianMTModel
else:
import importlib

View File

@@ -159,17 +159,6 @@ class MarianConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
# IMPORTANT
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
self.extra_pos_embeddings = 0
self.normalize_before = False
self.add_final_layer_norm = False
self.do_blenderbot_90_layernorm = False
self.normalize_embedding = False
self.static_position_embeddings = True
self.add_bias_logits = False
self.force_bos_token_to_be_generated = False
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads

File diff suppressed because it is too large Load Diff

View File

@@ -47,7 +47,7 @@ if is_torch_available():
]
if is_tf_available():
_import_structure["modeling_tf_mbart"] = ["TFMBartForConditionalGeneration"]
_import_structure["modeling_tf_mbart"] = ["TFMBartForConditionalGeneration", "TFMBartModel"]
if TYPE_CHECKING:
@@ -70,7 +70,7 @@ if TYPE_CHECKING:
)
if is_tf_available():
from .modeling_tf_mbart import TFMBartForConditionalGeneration
from .modeling_tf_mbart import TFMBartForConditionalGeneration, TFMBartModel
else:
import importlib

View File

@@ -159,17 +159,6 @@ class MBartConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
# IMPORTANT
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
self.extra_pos_embeddings = 2
self.normalize_before = True
self.add_final_layer_norm = True
self.do_blenderbot_90_layernorm = False
self.normalize_embedding = True
self.static_position_embeddings = False
self.add_bias_logits = False
self.force_bos_token_to_be_generated = False
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads

File diff suppressed because it is too large Load Diff

View File

@@ -45,7 +45,7 @@ if is_torch_available():
]
if is_tf_available():
_import_structure["modeling_tf_pegasus"] = ["TFPegasusForConditionalGeneration"]
_import_structure["modeling_tf_pegasus"] = ["TFPegasusForConditionalGeneration", "TFPegasusModel"]
if TYPE_CHECKING:
@@ -66,7 +66,7 @@ if TYPE_CHECKING:
)
if is_tf_available():
from .modeling_tf_pegasus import TFPegasusForConditionalGeneration
from .modeling_tf_pegasus import TFPegasusForConditionalGeneration, TFPegasusModel
else:
import importlib

View File

@@ -159,17 +159,6 @@ class PegasusConfig(PretrainedConfig):
self.gradient_checkpointing = gradient_checkpointing
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
# IMPORTANT
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
self.extra_pos_embeddings = 0
self.normalize_before = True
self.add_final_layer_norm = True
self.do_blenderbot_90_layernorm = False
self.normalize_embedding = False
self.static_position_embeddings = True
self.add_bias_logits = False
self.force_bos_token_to_be_generated = False
@property
def num_attention_heads(self) -> int:
return self.encoder_attention_heads

File diff suppressed because it is too large Load Diff

View File

@@ -369,6 +369,33 @@ class TFBlenderbotForConditionalGeneration:
requires_tf(self)
class TFBlenderbotModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
class TFBlenderbotSmallForConditionalGeneration:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
class TFBlenderbotSmallModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -952,6 +979,11 @@ class TFLxmertVisualFeatureEncoder:
requires_tf(self)
class TFMarian:
def __init__(self, *args, **kwargs):
requires_tf(self)
class TFMarianMTModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@@ -970,6 +1002,15 @@ class TFMBartForConditionalGeneration:
requires_tf(self)
class TFMBartModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
@@ -1211,6 +1252,15 @@ class TFPegasusForConditionalGeneration:
requires_tf(self)
class TFPegasusModel:
def __init__(self, *args, **kwargs):
requires_tf(self)
@classmethod
def from_pretrained(self, *args, **kwargs):
requires_tf(self)
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None