[PyTorch Bart] Split Bart into different models (#9343)

* first try

* remove old template

* finish bart

* finish mbart

* delete unnecessary line

* init pegasus

* save intermediate

* correct pegasus

* finish pegasus

* remove cookie cutter leftover

* add marian

* finish blenderbot

* replace in file

* correctly split blenderbot

* delete "old" folder

* correct "add statement"

* adapt config for tf comp

* correct configs for tf

* remove ipdb

* fix more stuff

* fix mbart

* push pegasus fix

* fix mbart

* more fixes

* fix research projects code

* finish docs for bart, mbart, and marian

* delete unnecessary file

* correct attn typo

* correct configs

* remove pegasus for seq class

* correct peg docs

* correct peg docs

* finish configs

* further improve docs

* add copied from statements to mbart

* fix copied from in mbart

* add copy statements to marian

* add copied from to marian

* add pegasus copied from

* finish pegasus

* finish copied from

* Apply suggestions from code review

* make style

* backward comp blenderbot

* apply Lysandre's and Sylvain's suggestions

* apply suggestions

* push last fixes

* fix docs

* fix tok tests

* fix imports code style

* fix doc
This commit is contained in:
Patrick von Platen
2021-01-05 22:00:05 +01:00
committed by GitHub
parent 4eec5d0cf6
commit eef66035a2
59 changed files with 9273 additions and 2271 deletions

View File

@@ -135,6 +135,10 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
>>> configuration = model.config
"""
model_type = "{{cookiecutter.lowercase_modelname}}"
{% if cookiecutter.is_encoder_decoder_model == "False" -%}
{% else -%}
keys_to_ignore_at_inference = ["past_key_values"]
{% endif -%}
def __init__(
self,

View File

@@ -1849,7 +1849,7 @@ class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel):
decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the input_ids right, following the paper.
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.

View File

@@ -1601,17 +1601,6 @@ def _expand_mask(
return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
def {{cookiecutter.camelcase_modelname}}LayerNorm(normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True):
if torch.cuda.is_available():
try:
from apex.normalization import FusedLayerNorm
return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
except ImportError:
pass
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
@@ -1619,7 +1608,6 @@ class {{cookiecutter.camelcase_modelname}}LearnedPositionalEmbedding(nn.Embeddin
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
num_embeddings
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
@@ -1774,13 +1762,13 @@ class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
)
self.self_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):
"""
@@ -1788,8 +1776,9 @@ class {{cookiecutter.camelcase_modelname}}EncoderLayer(nn.Module):
hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
attention_mask (:obj:`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (:obj:`bool`): Whether the base model outputs attentions.
This requires the attentions tensor to be reshaped in this function.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states, attn_weights, _ = self.self_attn(
@@ -1834,17 +1823,17 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = {{cookiecutter.camelcase_modelname}}Attention(
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
)
self.encoder_attn_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = {{cookiecutter.camelcase_modelname}}LayerNorm(self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
@@ -1865,8 +1854,9 @@ class {{cookiecutter.camelcase_modelname}}DecoderLayer(nn.Module):
encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (:obj:`bool`): Whether the base model outputs attentions.
This requires the attentions tensor to be reshaped in this function.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
returned tensors for more detail.
"""
residual = hidden_states
@@ -2031,7 +2021,7 @@ class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel):
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Provide for translation and summarization training. By default, the model will create this tensor by
shifting the :obj:`input_ids` to the right, following the paper.
decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
also be used by default.
@@ -2094,7 +2084,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
if embed_tokens is not None:
self.embed_tokens = embed_tokens
@@ -2107,7 +2097,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
self.padding_idx,
)
self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}EncoderLayer(config) for _ in range(config.encoder_layers)])
self.layernorm_embedding = {{cookiecutter.camelcase_modelname}}LayerNorm(embed_dim)
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.init_weights()
@@ -2251,7 +2241,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
self.padding_idx,
)
self.layers = nn.ModuleList([{{cookiecutter.camelcase_modelname}}DecoderLayer(config) for _ in range(config.decoder_layers)])
self.layernorm_embedding = {{cookiecutter.camelcase_modelname}}LayerNorm(config.d_model)
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.init_weights()
@@ -2513,7 +2503,7 @@ class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelna
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],

View File

@@ -480,8 +480,6 @@ import copy
import tempfile
import unittest
import timeout_decorator # noqa
from transformers import is_torch_available
from transformers.file_utils import cached_property
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device