[PyTorch Bart] Split Bart into different models (#9343)
* first try * remove old template * finish bart * finish mbart * delete unnecessary line * init pegasus * save intermediate * correct pegasus * finish pegasus * remove cookie cutter leftover * add marian * finish blenderbot * replace in file * correctly split blenderbot * delete "old" folder * correct "add statement" * adapt config for tf comp * correct configs for tf * remove ipdb * fix more stuff * fix mbart * push pegasus fix * fix mbart * more fixes * fix research projects code * finish docs for bart, mbart, and marian * delete unnecessary file * correct attn typo * correct configs * remove pegasus for seq class * correct peg docs * correct peg docs * finish configs * further improve docs * add copied from statements to mbart * fix copied from in mbart * add copy statements to marian * add copied from to marian * add pegasus copied from * finish pegasus * finish copied from * Apply suggestions from code review * make style * backward comp blenderbot * apply lysandres and sylvains suggestions * apply suggestions * push last fixes * fix docs * fix tok tests * fix imports code style * fix doc
This commit is contained in:
committed by
GitHub
parent
4eec5d0cf6
commit
eef66035a2
@@ -120,11 +120,11 @@ from .models.bert import (
|
||||
from .models.bert_generation import BertGenerationConfig
|
||||
from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
|
||||
from .models.bertweet import BertweetTokenizer
|
||||
from .models.blenderbot import (
|
||||
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BlenderbotConfig,
|
||||
from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer
|
||||
from .models.blenderbot_small import (
|
||||
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BlenderbotSmallConfig,
|
||||
BlenderbotSmallTokenizer,
|
||||
BlenderbotTokenizer,
|
||||
)
|
||||
from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
|
||||
from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
|
||||
@@ -415,6 +415,11 @@ if is_torch_available():
|
||||
BlenderbotForConditionalGeneration,
|
||||
BlenderbotModel,
|
||||
)
|
||||
from .models.blenderbot_small import (
|
||||
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
BlenderbotSmallForConditionalGeneration,
|
||||
BlenderbotSmallModel,
|
||||
)
|
||||
from .models.camembert import (
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
CamembertForCausalLM,
|
||||
@@ -536,8 +541,13 @@ if is_torch_available():
|
||||
LxmertVisualFeatureEncoder,
|
||||
LxmertXLayer,
|
||||
)
|
||||
from .models.marian import MarianMTModel
|
||||
from .models.mbart import MBartForConditionalGeneration, MBartModel
|
||||
from .models.marian import MarianModel, MarianMTModel
|
||||
from .models.mbart import (
|
||||
MBartForConditionalGeneration,
|
||||
MBartForQuestionAnswering,
|
||||
MBartForSequenceClassification,
|
||||
MBartModel,
|
||||
)
|
||||
from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
|
||||
from .models.mobilebert import (
|
||||
MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
|
||||
@@ -23,6 +23,10 @@ from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartCo
|
||||
from ..bert.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
|
||||
from ..bert_generation.configuration_bert_generation import BertGenerationConfig
|
||||
from ..blenderbot.configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
|
||||
from ..blenderbot_small.configuration_blenderbot_small import (
|
||||
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BlenderbotSmallConfig,
|
||||
)
|
||||
from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
|
||||
from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
|
||||
from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
|
||||
@@ -68,6 +72,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
for pretrained_map in [
|
||||
# Add archive maps here
|
||||
LED_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BART_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@@ -108,6 +113,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add configs here
|
||||
("led", LEDConfig),
|
||||
("blenderbot-small", BlenderbotSmallConfig),
|
||||
("retribert", RetriBertConfig),
|
||||
("mt5", MT5Config),
|
||||
("t5", T5Config),
|
||||
@@ -154,6 +160,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add full (and cased) model names here
|
||||
("led", "LED"),
|
||||
("blenderbot-small", "BlenderbotSmall"),
|
||||
("retribert", "RetriBERT"),
|
||||
("t5", "T5"),
|
||||
("mobilebert", "MobileBERT"),
|
||||
|
||||
@@ -51,6 +51,7 @@ from ..bert.modeling_bert import (
|
||||
)
|
||||
from ..bert_generation.modeling_bert_generation import BertGenerationDecoder, BertGenerationEncoder
|
||||
from ..blenderbot.modeling_blenderbot import BlenderbotForConditionalGeneration, BlenderbotModel
|
||||
from ..blenderbot_small.modeling_blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel
|
||||
from ..camembert.modeling_camembert import (
|
||||
CamembertForCausalLM,
|
||||
CamembertForMaskedLM,
|
||||
@@ -116,8 +117,13 @@ from ..longformer.modeling_longformer import (
|
||||
LongformerModel,
|
||||
)
|
||||
from ..lxmert.modeling_lxmert import LxmertForPreTraining, LxmertForQuestionAnswering, LxmertModel
|
||||
from ..marian.modeling_marian import MarianMTModel
|
||||
from ..mbart.modeling_mbart import MBartForConditionalGeneration, MBartModel
|
||||
from ..marian.modeling_marian import MarianModel, MarianMTModel
|
||||
from ..mbart.modeling_mbart import (
|
||||
MBartForConditionalGeneration,
|
||||
MBartForQuestionAnswering,
|
||||
MBartForSequenceClassification,
|
||||
MBartModel,
|
||||
)
|
||||
from ..mobilebert.modeling_mobilebert import (
|
||||
MobileBertForMaskedLM,
|
||||
MobileBertForMultipleChoice,
|
||||
@@ -215,6 +221,7 @@ from .configuration_auto import (
|
||||
BertConfig,
|
||||
BertGenerationConfig,
|
||||
BlenderbotConfig,
|
||||
BlenderbotSmallConfig,
|
||||
CamembertConfig,
|
||||
CTRLConfig,
|
||||
DebertaConfig,
|
||||
@@ -260,6 +267,7 @@ MODEL_MAPPING = OrderedDict(
|
||||
[
|
||||
# Base model mapping
|
||||
(LEDConfig, LEDModel),
|
||||
(BlenderbotSmallConfig, BlenderbotSmallModel),
|
||||
(RetriBertConfig, RetriBertModel),
|
||||
(MT5Config, MT5Model),
|
||||
(T5Config, T5Model),
|
||||
@@ -297,6 +305,7 @@ MODEL_MAPPING = OrderedDict(
|
||||
(ProphetNetConfig, ProphetNetModel),
|
||||
(MPNetConfig, MPNetModel),
|
||||
(TapasConfig, TapasModel),
|
||||
(MarianConfig, MarianModel),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -336,6 +345,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model with LM heads mapping
|
||||
(LEDConfig, LEDForConditionalGeneration),
|
||||
(BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
|
||||
(LayoutLMConfig, LayoutLMForMaskedLM),
|
||||
(T5Config, T5ForConditionalGeneration),
|
||||
(DistilBertConfig, DistilBertForMaskedLM),
|
||||
@@ -417,6 +427,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model for Seq2Seq Causal LM mapping
|
||||
(LEDConfig, LEDForConditionalGeneration),
|
||||
(BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
|
||||
(MT5Config, MT5ForConditionalGeneration),
|
||||
(T5Config, T5ForConditionalGeneration),
|
||||
(PegasusConfig, PegasusForConditionalGeneration),
|
||||
@@ -439,6 +450,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
(AlbertConfig, AlbertForSequenceClassification),
|
||||
(CamembertConfig, CamembertForSequenceClassification),
|
||||
(XLMRobertaConfig, XLMRobertaForSequenceClassification),
|
||||
(MBartConfig, MBartForSequenceClassification),
|
||||
(BartConfig, BartForSequenceClassification),
|
||||
(LongformerConfig, LongformerForSequenceClassification),
|
||||
(RobertaConfig, RobertaForSequenceClassification),
|
||||
@@ -469,6 +481,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
|
||||
(AlbertConfig, AlbertForQuestionAnswering),
|
||||
(CamembertConfig, CamembertForQuestionAnswering),
|
||||
(BartConfig, BartForQuestionAnswering),
|
||||
(MBartConfig, MBartForQuestionAnswering),
|
||||
(LongformerConfig, LongformerForQuestionAnswering),
|
||||
(XLMRobertaConfig, XLMRobertaForQuestionAnswering),
|
||||
(RobertaConfig, RobertaForQuestionAnswering),
|
||||
|
||||
@@ -24,7 +24,7 @@ from ..bart.tokenization_bart import BartTokenizer
|
||||
from ..bert.tokenization_bert import BertTokenizer
|
||||
from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer
|
||||
from ..bertweet.tokenization_bertweet import BertweetTokenizer
|
||||
from ..blenderbot.tokenization_blenderbot import BlenderbotSmallTokenizer
|
||||
from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer
|
||||
from ..ctrl.tokenization_ctrl import CTRLTokenizer
|
||||
from ..deberta.tokenization_deberta import DebertaTokenizer
|
||||
from ..distilbert.tokenization_distilbert import DistilBertTokenizer
|
||||
@@ -197,12 +197,12 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
|
||||
(CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)),
|
||||
(PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)),
|
||||
(MBartConfig, (BarthezTokenizer, BarthezTokenizerFast)),
|
||||
(MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
|
||||
(XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
|
||||
(MarianConfig, (MarianTokenizer, None)),
|
||||
(BlenderbotConfig, (BlenderbotSmallTokenizer, None)),
|
||||
(LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
|
||||
(BartConfig, (BarthezTokenizer, BarthezTokenizerFast)),
|
||||
(BartConfig, (BartTokenizer, BartTokenizerFast)),
|
||||
(LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
|
||||
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
|
||||
|
||||
@@ -15,9 +15,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_bart import BartConfig
|
||||
from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig
|
||||
from .tokenization_bart import BartTokenizer
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
|
||||
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -12,7 +12,7 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" BART configuration """
|
||||
""" BART model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
@@ -21,34 +21,33 @@ from ...utils import logging
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/config.json",
|
||||
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
|
||||
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json",
|
||||
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json",
|
||||
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json",
|
||||
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json",
|
||||
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/config.json",
|
||||
# See all BART models at https://huggingface.co/models?filter=bart
|
||||
}
|
||||
|
||||
|
||||
class BartConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to
|
||||
instantiate a BART model according to the specified arguments, defining the model architecture.
|
||||
instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large
|
||||
<https://huggingface.co/facebook/bart-large>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`input_ids` passed when calling :class:`~transformers.BartModel`.
|
||||
Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`input_ids` passed when calling :class:`~transformers.BartModel` or
|
||||
:class:`~transformers.TFBartModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of encoder layers, 6 are used for the `bart-base` model.
|
||||
Number of encoder layers.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of decoder layers, 6 are used for the `bart-base` model.
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
@@ -73,145 +72,113 @@ class BartConfig(PretrainedConfig):
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
This should be completed, specific to marian.
|
||||
normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Call layernorm before attention ops.
|
||||
normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Call layernorm after embeddings.
|
||||
static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Don't learn positional embeddings, use sinusoidal.
|
||||
add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Why not add another layernorm?
|
||||
do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by multiplying by sqrt(d_model).
|
||||
eos_token_id (:obj:`int`, `optional`, defaults to 2)
|
||||
End of stream token id.
|
||||
pad_token_id (:obj:`int`, `optional`, defaults to 1)
|
||||
Padding token id.
|
||||
bos_token_id (:obj:`int`, `optional`, defaults to 0)
|
||||
Beginning of stream token id.
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
|
||||
:obj:`True` for `bart-large-cnn`.
|
||||
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper
|
||||
<https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper
|
||||
<https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
|
||||
How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
|
||||
num_labels: (:obj:`int`, `optional`, defaults to 3):
|
||||
The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether this is an encoder/decoder model.
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
|
||||
:obj:`True` for `bart-large-cnn`.
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by multiplying by sqrt(d_model).
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
num_labels: (:obj:`int`, `optional`, defaults to 3):
|
||||
The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import BartModel, BartConfig
|
||||
|
||||
>>> # Initializing a BART facebook/bart-large style configuration
|
||||
>>> configuration = BartConfig()
|
||||
|
||||
>>> # Initializing a model from the facebook/bart-large style configuration
|
||||
>>> model = BartModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "bart"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
activation_dropout=0.0,
|
||||
extra_pos_embeddings=2,
|
||||
activation_function="gelu",
|
||||
vocab_size=50265,
|
||||
d_model=1024,
|
||||
encoder_ffn_dim=4096,
|
||||
max_position_embeddings=1024,
|
||||
encoder_layers=12,
|
||||
encoder_ffn_dim=4096,
|
||||
encoder_attention_heads=16,
|
||||
decoder_ffn_dim=4096,
|
||||
decoder_layers=12,
|
||||
decoder_ffn_dim=4096,
|
||||
decoder_attention_heads=16,
|
||||
encoder_layerdrop=0.0,
|
||||
decoder_layerdrop=0.0,
|
||||
attention_dropout=0.0,
|
||||
activation_function="gelu",
|
||||
d_model=1024,
|
||||
dropout=0.1,
|
||||
max_position_embeddings=1024,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
init_std=0.02,
|
||||
classifier_dropout=0.0,
|
||||
num_labels=3,
|
||||
is_encoder_decoder=True,
|
||||
normalize_before=False,
|
||||
add_final_layer_norm=False,
|
||||
do_blenderbot_90_layernorm=False,
|
||||
scale_embedding=False,
|
||||
normalize_embedding=True,
|
||||
static_position_embeddings=False,
|
||||
add_bias_logits=False,
|
||||
gradient_checkpointing=False,
|
||||
force_bos_token_to_be_generated=False,
|
||||
use_cache=True,
|
||||
num_labels=3,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
**common_kwargs
|
||||
is_encoder_decoder=True,
|
||||
decoder_start_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
:class:`~transformers.BartConfig` is the configuration class for `BartModel`.
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from transformers import BartConfig, BartModel
|
||||
|
||||
>>> config = BartConfig.from_pretrained('facebook/bart-large')
|
||||
>>> model = BartModel(config)
|
||||
|
||||
"""
|
||||
if "hidden_size" in common_kwargs:
|
||||
raise ValueError("hidden size is called d_model")
|
||||
super().__init__(
|
||||
num_labels=num_labels,
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
**common_kwargs,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.d_model = d_model # encoder_embed_dim and decoder_embed_dim
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
self.encoder_layers = self.num_hidden_layers = encoder_layers
|
||||
self.encoder_layers = encoder_layers
|
||||
self.encoder_attention_heads = encoder_attention_heads
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.decoder_layerdrop = decoder_layerdrop
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
self.decoder_layers = decoder_layers
|
||||
self.decoder_attention_heads = decoder_attention_heads
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.init_std = init_std # Normal(0, this parameter)
|
||||
self.activation_function = activation_function
|
||||
|
||||
# Params introduced for Mbart
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
self.normalize_embedding = normalize_embedding # True for mbart, False otherwise
|
||||
self.normalize_before = normalize_before # combo of fairseq's encoder_ and decoder_normalize_before
|
||||
self.add_final_layer_norm = add_final_layer_norm
|
||||
|
||||
# Params introduced for Marian
|
||||
self.add_bias_logits = add_bias_logits
|
||||
self.static_position_embeddings = static_position_embeddings
|
||||
|
||||
# 3 Types of Dropout
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.dropout = dropout
|
||||
|
||||
# Classifier stuff
|
||||
self.activation_function = activation_function
|
||||
self.init_std = init_std
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.decoder_layerdrop = decoder_layerdrop
|
||||
self.classifier_dropout = classifier_dropout
|
||||
|
||||
# pos embedding offset
|
||||
self.extra_pos_embeddings = extra_pos_embeddings
|
||||
# bart has a hack that offsets positional embeddings by 2, other models don't do this
|
||||
|
||||
self.force_bos_token_to_be_generated = force_bos_token_to_be_generated
|
||||
|
||||
self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm
|
||||
|
||||
self.use_cache = use_cache
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
self.force_bos_token_to_be_generated = force_bos_token_to_be_generated # only relevant for CNN
|
||||
|
||||
# IMPORTANT
|
||||
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
|
||||
self.extra_pos_embeddings = 2
|
||||
self.normalize_before = False
|
||||
self.add_final_layer_norm = False
|
||||
self.do_blenderbot_90_layernorm = False
|
||||
self.normalize_embedding = True
|
||||
self.static_position_embeddings = False
|
||||
self.add_bias_logits = False
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
@@ -220,11 +187,3 @@ class BartConfig(PretrainedConfig):
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
def is_valid_mbart(self) -> bool:
|
||||
"""Is the configuration aligned with the MBART paper."""
|
||||
if self.normalize_before and self.add_final_layer_norm and self.scale_embedding:
|
||||
return True
|
||||
if self.normalize_before or self.add_final_layer_norm or self.scale_embedding:
|
||||
logger.info("This configuration is a mixture of MBART and BART settings")
|
||||
return False
|
||||
|
||||
396
src/transformers/models/bart/modeling_bart.py
Normal file → Executable file
396
src/transformers/models/bart/modeling_bart.py
Normal file → Executable file
@@ -1,5 +1,5 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -12,17 +12,18 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch BART model, ported from the fairseq repo."""
|
||||
""" PyTorch BART model. """
|
||||
|
||||
|
||||
import math
|
||||
import random
|
||||
import warnings
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss, LayerNorm
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
from ...activations import ACT2FN
|
||||
from ...file_utils import (
|
||||
@@ -52,32 +53,24 @@ _TOKENIZER_FOR_DOC = "BartTokenizer"
|
||||
|
||||
|
||||
BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
"facebook/bart-base",
|
||||
"facebook/bart-large",
|
||||
"facebook/bart-large-mnli",
|
||||
"facebook/bart-large-cnn",
|
||||
"facebook/bart-large-xsum",
|
||||
"facebook/mbart-large-en-ro",
|
||||
# See all BART models at https://huggingface.co/models?filter=bart
|
||||
]
|
||||
# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart
|
||||
|
||||
|
||||
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
|
||||
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
|
||||
"""
|
||||
Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).
|
||||
Shift input ids one token to the right.
|
||||
"""
|
||||
prev_output_tokens = input_ids.clone()
|
||||
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
|
||||
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
|
||||
shifted_input_ids[:, 0] = decoder_start_token_id
|
||||
|
||||
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
|
||||
# replace possible -100 values in labels by `pad_token_id`
|
||||
prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
|
||||
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
|
||||
|
||||
index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
|
||||
decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
|
||||
prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
|
||||
prev_output_tokens[:, 0] = decoder_start_tokens
|
||||
|
||||
return prev_output_tokens
|
||||
return shifted_input_ids
|
||||
|
||||
|
||||
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
|
||||
@@ -111,18 +104,15 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
|
||||
|
||||
class BartLearnedPositionalEmbedding(nn.Embedding):
|
||||
"""
|
||||
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
|
||||
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
|
||||
the forward function.
|
||||
This module learns positional embeddings up to a fixed maximum size.
|
||||
"""
|
||||
|
||||
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int):
|
||||
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
|
||||
assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
|
||||
# Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
|
||||
# and adjust num_embeddings appropriately. Other models don't have this hack
|
||||
self.offset = offset
|
||||
assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
|
||||
num_embeddings += offset
|
||||
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
|
||||
self.offset = 2
|
||||
super().__init__(num_embeddings + self.offset, embedding_dim, padding_idx=padding_idx)
|
||||
|
||||
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
|
||||
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
|
||||
@@ -133,40 +123,6 @@ class BartLearnedPositionalEmbedding(nn.Embedding):
|
||||
return super().forward(positions + self.offset)
|
||||
|
||||
|
||||
class BartSinusoidalPositionalEmbedding(nn.Embedding):
|
||||
"""This module produces sinusoidal positional embeddings of any length."""
|
||||
|
||||
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
|
||||
super().__init__(num_positions, embedding_dim)
|
||||
self.weight = self._init_weight(self.weight)
|
||||
|
||||
@staticmethod
|
||||
def _init_weight(out: nn.Parameter):
|
||||
"""
|
||||
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
|
||||
the 2nd half of the vector. [dim // 2:]
|
||||
"""
|
||||
n_pos, dim = out.shape
|
||||
position_enc = np.array(
|
||||
[[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
|
||||
)
|
||||
out.requires_grad = False # set early to avoid an error in pytorch-1.8+
|
||||
sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
|
||||
out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
|
||||
out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
|
||||
out.detach_()
|
||||
return out
|
||||
|
||||
@torch.no_grad()
|
||||
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
|
||||
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
|
||||
bsz, seq_len = input_ids_shape[:2]
|
||||
positions = torch.arange(
|
||||
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
|
||||
)
|
||||
return super().forward(positions)
|
||||
|
||||
|
||||
class BartAttention(nn.Module):
|
||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||
|
||||
@@ -310,14 +266,13 @@ class BartEncoderLayer(nn.Module):
|
||||
num_heads=config.encoder_attention_heads,
|
||||
dropout=config.attention_dropout,
|
||||
)
|
||||
self.normalize_before = config.normalize_before
|
||||
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
|
||||
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.dropout = config.dropout
|
||||
self.activation_fn = ACT2FN[config.activation_function]
|
||||
self.activation_dropout = config.activation_dropout
|
||||
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
|
||||
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
|
||||
self.final_layer_norm = LayerNorm(self.embed_dim)
|
||||
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):
|
||||
"""
|
||||
@@ -325,33 +280,36 @@ class BartEncoderLayer(nn.Module):
|
||||
hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
|
||||
attention_mask (:obj:`torch.FloatTensor`): attention mask of size
|
||||
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
||||
output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
|
||||
returned tensors for more detail.
|
||||
"""
|
||||
residual = hidden_states
|
||||
if self.normalize_before:
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
hidden_states, attn_weights, _ = self.self_attn(
|
||||
hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
|
||||
)
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
if not self.normalize_before:
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
|
||||
residual = hidden_states
|
||||
if self.normalize_before:
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
||||
hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
||||
hidden_states = self.fc2(hidden_states)
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
if not self.normalize_before:
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
|
||||
if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
|
||||
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
|
||||
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
|
||||
return hidden_states, attn_weights
|
||||
|
||||
outputs = (hidden_states,)
|
||||
|
||||
if output_attentions:
|
||||
outputs += (attn_weights,)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class BartDecoderLayer(nn.Module):
|
||||
@@ -368,19 +326,18 @@ class BartDecoderLayer(nn.Module):
|
||||
self.dropout = config.dropout
|
||||
self.activation_fn = ACT2FN[config.activation_function]
|
||||
self.activation_dropout = config.activation_dropout
|
||||
self.normalize_before = config.normalize_before
|
||||
|
||||
self.self_attn_layer_norm = LayerNorm(self.embed_dim)
|
||||
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.encoder_attn = BartAttention(
|
||||
self.embed_dim,
|
||||
config.decoder_attention_heads,
|
||||
dropout=config.attention_dropout,
|
||||
is_decoder=True,
|
||||
)
|
||||
self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
|
||||
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
|
||||
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
|
||||
self.final_layer_norm = LayerNorm(self.embed_dim)
|
||||
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@@ -389,7 +346,8 @@ class BartDecoderLayer(nn.Module):
|
||||
encoder_hidden_states: Optional[torch.Tensor] = None,
|
||||
encoder_attention_mask: Optional[torch.Tensor] = None,
|
||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
||||
output_attentions: Optional[torch.Tensor] = False,
|
||||
output_attentions: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = True,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
@@ -400,11 +358,11 @@ class BartDecoderLayer(nn.Module):
|
||||
encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
|
||||
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
||||
past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
|
||||
output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
|
||||
returned tensors for more detail.
|
||||
"""
|
||||
residual = hidden_states
|
||||
if self.normalize_before:
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
||||
@@ -418,16 +376,13 @@ class BartDecoderLayer(nn.Module):
|
||||
)
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
if not self.normalize_before:
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
hidden_states = self.self_attn_layer_norm(hidden_states)
|
||||
|
||||
# Cross-Attention Block
|
||||
cross_attn_present_key_value = None
|
||||
cross_attn_weights = None
|
||||
if encoder_hidden_states is not None:
|
||||
residual = hidden_states
|
||||
if self.normalize_before:
|
||||
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
||||
|
||||
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
|
||||
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
|
||||
@@ -440,30 +395,29 @@ class BartDecoderLayer(nn.Module):
|
||||
)
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
if not self.normalize_before:
|
||||
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
||||
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
||||
|
||||
# add cross-attn to positions 3,4 of present_key_value tuple
|
||||
present_key_value = present_key_value + cross_attn_present_key_value
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
if self.normalize_before:
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
||||
hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
|
||||
hidden_states = self.fc2(hidden_states)
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
hidden_states = residual + hidden_states
|
||||
if not self.normalize_before:
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
hidden_states = self.final_layer_norm(hidden_states)
|
||||
|
||||
return (
|
||||
hidden_states,
|
||||
self_attn_weights,
|
||||
present_key_value,
|
||||
cross_attn_weights,
|
||||
)
|
||||
outputs = (hidden_states,)
|
||||
|
||||
if output_attentions:
|
||||
outputs += (self_attn_weights, cross_attn_weights)
|
||||
|
||||
if use_cache:
|
||||
outputs += (present_key_value,)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
class BartClassificationHead(nn.Module):
|
||||
@@ -500,8 +454,6 @@ class BartPretrainedModel(PreTrainedModel):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, BartSinusoidalPositionalEmbedding):
|
||||
pass
|
||||
elif isinstance(module, nn.Embedding):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.padding_idx is not None:
|
||||
@@ -536,10 +488,10 @@ BART_START_DOCSTRING = r"""
|
||||
general usage and behavior.
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
config (:class:`~transformers.BartConfig`):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
:meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
|
||||
"""
|
||||
|
||||
BART_GENERATION_EXAMPLE = r"""
|
||||
@@ -547,9 +499,8 @@ BART_GENERATION_EXAMPLE = r"""
|
||||
|
||||
>>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
|
||||
|
||||
>>> # see ``examples/summarization/bart/run_eval.py`` for a longer example
|
||||
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
|
||||
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
|
||||
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
|
||||
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
|
||||
|
||||
>>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
|
||||
>>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
|
||||
@@ -557,6 +508,22 @@ BART_GENERATION_EXAMPLE = r"""
|
||||
>>> # Generate Summary
|
||||
>>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
|
||||
>>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
|
||||
|
||||
Mask filling example::
|
||||
|
||||
>>> from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
|
||||
>>> TXT = "My friends are <mask> but they eat too many carbs."
|
||||
|
||||
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
|
||||
>>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
|
||||
>>> logits = model(input_ids).logits
|
||||
|
||||
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
||||
>>> probs = logits[0, masked_index].softmax(dim=0)
|
||||
>>> values, predictions = probs.topk(5)
|
||||
|
||||
>>> tokenizer.decode(predictions).split()
|
||||
"""
|
||||
|
||||
BART_INPUTS_DOCSTRING = r"""
|
||||
@@ -578,9 +545,22 @@ BART_INPUTS_DOCSTRING = r"""
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for translation and summarization training. By default, the model will create this tensor by
|
||||
shifting the :obj:`input_ids` to the right, following the paper.
|
||||
decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
Indices of decoder input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.BartTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
|
||||
Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
|
||||
:obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
|
||||
:obj:`past_key_values`).
|
||||
|
||||
For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
|
||||
:obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
|
||||
the right for denoising pre-training following the paper.
|
||||
decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default.
|
||||
|
||||
@@ -641,30 +621,22 @@ class BartEncoder(BartPretrainedModel):
|
||||
self.layerdrop = config.encoder_layerdrop
|
||||
|
||||
embed_dim = config.d_model
|
||||
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.max_source_positions = config.max_position_embeddings
|
||||
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
||||
|
||||
if embed_tokens is not None:
|
||||
self.embed_tokens = embed_tokens
|
||||
else:
|
||||
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
|
||||
|
||||
if config.static_position_embeddings:
|
||||
self.embed_positions = BartSinusoidalPositionalEmbedding(
|
||||
config.max_position_embeddings, embed_dim, self.padding_idx
|
||||
)
|
||||
else:
|
||||
self.embed_positions = BartLearnedPositionalEmbedding(
|
||||
config.max_position_embeddings,
|
||||
embed_dim,
|
||||
self.padding_idx,
|
||||
config.extra_pos_embeddings,
|
||||
)
|
||||
self.embed_positions = BartLearnedPositionalEmbedding(
|
||||
config.max_position_embeddings,
|
||||
embed_dim,
|
||||
self.padding_idx,
|
||||
)
|
||||
self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
|
||||
self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
|
||||
# mbart has one extra layer_norm
|
||||
self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None
|
||||
self.layernorm_embedding = nn.LayerNorm(embed_dim)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
@@ -747,15 +719,28 @@ class BartEncoder(BartPretrainedModel):
|
||||
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
||||
dropout_probability = random.uniform(0, 1)
|
||||
if self.training and (dropout_probability < self.layerdrop): # skip the layer
|
||||
attn = None
|
||||
layer_outputs = (None, None)
|
||||
else:
|
||||
hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)
|
||||
if getattr(self.config, "gradient_checkpointing", False):
|
||||
|
||||
def create_custom_forward(module):
|
||||
def custom_forward(*inputs):
|
||||
return module(*inputs, output_attentions)
|
||||
|
||||
return custom_forward
|
||||
|
||||
layer_outputs = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(encoder_layer),
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
)
|
||||
else:
|
||||
layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if output_attentions:
|
||||
all_attentions = all_attentions + (attn,)
|
||||
|
||||
if self.layer_norm:
|
||||
hidden_states = self.layer_norm(hidden_states)
|
||||
all_attentions = all_attentions + (layer_outputs[1],)
|
||||
|
||||
if output_hidden_states:
|
||||
encoder_states = encoder_states + (hidden_states,)
|
||||
@@ -780,7 +765,6 @@ class BartDecoder(BartPretrainedModel):
|
||||
super().__init__(config)
|
||||
self.dropout = config.dropout
|
||||
self.layerdrop = config.decoder_layerdrop
|
||||
self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.max_target_positions = config.max_position_embeddings
|
||||
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
|
||||
@@ -790,20 +774,13 @@ class BartDecoder(BartPretrainedModel):
|
||||
else:
|
||||
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
|
||||
|
||||
if config.static_position_embeddings:
|
||||
self.embed_positions = BartSinusoidalPositionalEmbedding(
|
||||
config.max_position_embeddings, config.d_model, config.pad_token_id
|
||||
)
|
||||
else:
|
||||
self.embed_positions = BartLearnedPositionalEmbedding(
|
||||
config.max_position_embeddings,
|
||||
config.d_model,
|
||||
self.padding_idx,
|
||||
config.extra_pos_embeddings,
|
||||
)
|
||||
self.embed_positions = BartLearnedPositionalEmbedding(
|
||||
config.max_position_embeddings,
|
||||
config.d_model,
|
||||
self.padding_idx,
|
||||
)
|
||||
self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
|
||||
self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity()
|
||||
self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None
|
||||
self.layernorm_embedding = nn.LayerNorm(config.d_model)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
@@ -902,33 +879,6 @@ class BartDecoder(BartPretrainedModel):
|
||||
input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
|
||||
).to(self.device)
|
||||
|
||||
# create decoder_padding_mask if not provided and needed
|
||||
# 4.12.20 (PVP): Not a fan of this "magical" function that
|
||||
# automatically creates attention_mask for padded tokens
|
||||
# => this is inconsistent with other models
|
||||
# => Pegasus uses the pad_token as decoder_start_token_id, so that this could
|
||||
# pose some problems.
|
||||
if (
|
||||
attention_mask is None
|
||||
and input_ids is not None
|
||||
and input_shape[-1] > 1
|
||||
and self.config.pad_token_id in input_ids
|
||||
):
|
||||
# should be kept for backwards compatibility
|
||||
attention_mask = input_ids.ne(self.config.pad_token_id).to(torch.long)
|
||||
# never mask leading token, even if it is pad
|
||||
attention_mask[:, 0] = attention_mask[:, 1]
|
||||
if past_key_values_length > 0:
|
||||
attention_mask = torch.cat(
|
||||
[
|
||||
torch.ones(
|
||||
(input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device
|
||||
),
|
||||
attention_mask,
|
||||
],
|
||||
dim=-1,
|
||||
)
|
||||
|
||||
if attention_mask is not None and combined_attention_mask is not None:
|
||||
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
||||
combined_attention_mask = combined_attention_mask + _expand_mask(
|
||||
@@ -943,12 +893,8 @@ class BartDecoder(BartPretrainedModel):
|
||||
# embed positions
|
||||
positions = self.embed_positions(input_shape, past_key_values_length)
|
||||
|
||||
if self.do_blenderbot_90_layernorm:
|
||||
hidden_states = self.layernorm_embedding(inputs_embeds)
|
||||
hidden_states += positions
|
||||
else:
|
||||
hidden_states = inputs_embeds + positions
|
||||
hidden_states = self.layernorm_embedding(hidden_states)
|
||||
hidden_states = inputs_embeds + positions
|
||||
hidden_states = self.layernorm_embedding(hidden_states)
|
||||
|
||||
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
||||
|
||||
@@ -967,30 +913,51 @@ class BartDecoder(BartPretrainedModel):
|
||||
|
||||
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||
|
||||
hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=combined_attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
if getattr(self.config, "gradient_checkpointing", False):
|
||||
if use_cache:
|
||||
raise ValueError(
|
||||
"When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`."
|
||||
)
|
||||
|
||||
def create_custom_forward(module):
|
||||
def custom_forward(*inputs):
|
||||
# None for past_key_value
|
||||
return module(*inputs, output_attentions, use_cache)
|
||||
|
||||
return custom_forward
|
||||
|
||||
layer_outputs = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(decoder_layer),
|
||||
hidden_states,
|
||||
combined_attention_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
None,
|
||||
)
|
||||
else:
|
||||
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=combined_attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
next_decoder_cache += (present_key_value,)
|
||||
next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_self_attn,)
|
||||
all_cross_attentions += (layer_cross_attn,)
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
all_cross_attentions += (layer_outputs[2],)
|
||||
|
||||
# add hidden states from the last decoder layer
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
# if config.add_final_layer_norm (mBART)
|
||||
if self.layer_norm:
|
||||
hidden_states = self.layer_norm(hidden_states)
|
||||
|
||||
next_cache = next_decoder_cache if use_cache else None
|
||||
if not return_dict:
|
||||
return tuple(
|
||||
@@ -1060,12 +1027,12 @@ class BartModel(BartPretrainedModel):
|
||||
return_dict=None,
|
||||
):
|
||||
|
||||
# 4.12.20 (PVP): Not a fan of this "magical" function and
|
||||
# also wonder how often it's actually used ... keep now
|
||||
# for backward compatibility
|
||||
# -> is this used for backward compatibility
|
||||
# different to other models, Bart automatically creates decoder_input_ids from
|
||||
# input_ids if no decoder_input_ids are provided
|
||||
if decoder_input_ids is None and decoder_inputs_embeds is None:
|
||||
decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
|
||||
decoder_input_ids = shift_tokens_right(
|
||||
input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
|
||||
)
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
@@ -1083,7 +1050,7 @@ class BartModel(BartPretrainedModel):
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
|
||||
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
@@ -1192,31 +1159,14 @@ class BartForConditionalGeneration(BartPretrainedModel):
|
||||
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
|
||||
|
||||
Returns:
|
||||
|
||||
Conditional generation example::
|
||||
|
||||
>>> # Mask filling only works for bart-large
|
||||
>>> from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
>>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
|
||||
>>> TXT = "My friends are <mask> but they eat too many carbs."
|
||||
|
||||
>>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
|
||||
>>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
|
||||
>>> logits = model(input_ids).logits
|
||||
|
||||
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
||||
>>> probs = logits[0, masked_index].softmax(dim=0)
|
||||
>>> values, predictions = probs.topk(5)
|
||||
|
||||
>>> tokenizer.decode(predictions).split()
|
||||
>>> # ['good', 'great', 'all', 'really', 'very']
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if labels is not None:
|
||||
use_cache = False
|
||||
if decoder_input_ids is None:
|
||||
decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
|
||||
decoder_input_ids = shift_tokens_right(
|
||||
labels, self.config.pad_token_id, self.config.decoder_start_token_id
|
||||
)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids,
|
||||
@@ -1237,7 +1187,6 @@ class BartForConditionalGeneration(BartPretrainedModel):
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
# TODO(SS): do we need to ignore pad tokens in labels?
|
||||
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
@@ -1288,7 +1237,10 @@ class BartForConditionalGeneration(BartPretrainedModel):
|
||||
def _reorder_cache(past, beam_idx):
|
||||
reordered_past = ()
|
||||
for layer_past in past:
|
||||
reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
|
||||
# cached cross_attention states don't have to be reordered -> they are always the same
|
||||
reordered_past += (
|
||||
tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
|
||||
)
|
||||
return reordered_past
|
||||
|
||||
|
||||
|
||||
@@ -545,7 +545,7 @@ BART_INPUTS_DOCSTRING = r"""
|
||||
decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for translation and summarization training. By default, the model will create this tensor by
|
||||
shifting the input_ids right, following the paper.
|
||||
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
|
||||
encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
|
||||
hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
from ...file_utils import is_tf_available, is_torch_available
|
||||
from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
|
||||
from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer
|
||||
from .tokenization_blenderbot import BlenderbotTokenizer
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -26,7 +26,9 @@ if is_torch_available():
|
||||
BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
BlenderbotForConditionalGeneration,
|
||||
BlenderbotModel,
|
||||
BlenderbotPreTrainedModel,
|
||||
)
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
# Copyright (c) Facebook, Inc. and Huggingface, 2020
|
||||
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the;
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
@@ -13,46 +12,49 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
"""
|
||||
BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document
|
||||
blenderbot-90M defaults.
|
||||
"""
|
||||
from ..bart.configuration_bart import BartConfig
|
||||
""" Blenderbot model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"facebook/blenderbot-3B": "https://cdn.huggingface.co/facebook/blenderbot-3B/config.json",
|
||||
"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/config.json",
|
||||
"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/config.json",
|
||||
# See all Blenderbot models at https://huggingface.co/models?filter=blenderbot
|
||||
}
|
||||
|
||||
|
||||
class BlenderbotConfig(BartConfig):
|
||||
class BlenderbotConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a
|
||||
:class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and
|
||||
has the same signature with different defaults.
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used
|
||||
to instantiate a Blenderbot model according to the specified arguments, defining the model architecture.
|
||||
Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot
|
||||
`facebook/blenderbot-3B <https://huggingface.co/facebook/blenderbot-3B>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 54944):
|
||||
Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`input_ids` passed when calling :class:`~transformers.BlenderbotForConditionalGeneration`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 512):
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by
|
||||
the :obj:`input_ids` passed when calling :class:`~transformers.BlenderbotModel` or
|
||||
:class:`~transformers.TFBlenderbotModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 8):
|
||||
Number of encoder layers, 6 are used for the `blenderbot-90M` model.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 8):
|
||||
Number of decoder layers, 6 are used for the `blenderbot-90M` model.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of encoder layers.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
@@ -65,117 +67,115 @@ class BlenderbotConfig(BartConfig):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for classifier.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 128):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
This should be completed, specific to marian.
|
||||
normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Call layernorm before attention ops.
|
||||
normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Call layernorm after embeddings.
|
||||
static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Don't learn positional embeddings, use sinusoidal.
|
||||
add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Why not add another layernorm?
|
||||
do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by diving by sqrt(d_model).
|
||||
eos_token_id (:obj:`int`, `optional`, defaults to 2)
|
||||
End of stream token id.
|
||||
pad_token_id (:obj:`int`, `optional`, defaults to 1)
|
||||
Padding token id.
|
||||
bos_token_id (:obj:`int`, `optional`, defaults to 0)
|
||||
Beginning of stream token id.
|
||||
encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper <
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper <
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
|
||||
How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether this is an encoder/decoder model.
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``),
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by a factor of sqrt(d_model).
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import BlenderbotModel, BlenderbotConfig
|
||||
|
||||
>>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
|
||||
>>> configuration = BlenderbotConfig()
|
||||
|
||||
>>> # Initializing a model from the facebook/blenderbot-3B style configuration
|
||||
>>> model = BlenderbotModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "blenderbot"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
activation_dropout=0.0,
|
||||
extra_pos_embeddings=0,
|
||||
activation_function="gelu",
|
||||
vocab_size=54944,
|
||||
d_model=512,
|
||||
encoder_ffn_dim=2048,
|
||||
encoder_layers=8,
|
||||
encoder_attention_heads=16,
|
||||
decoder_ffn_dim=2048,
|
||||
decoder_layers=8,
|
||||
decoder_attention_heads=16,
|
||||
vocab_size=8008,
|
||||
max_position_embeddings=128,
|
||||
encoder_layers=2,
|
||||
encoder_ffn_dim=10240,
|
||||
encoder_attention_heads=32,
|
||||
decoder_layers=24,
|
||||
decoder_ffn_dim=10240,
|
||||
decoder_attention_heads=32,
|
||||
encoder_layerdrop=0.0,
|
||||
decoder_layerdrop=0.0,
|
||||
attention_dropout=0.0,
|
||||
dropout=0.1,
|
||||
max_position_embeddings=512,
|
||||
classifier_dropout=0.0,
|
||||
use_cache=True,
|
||||
is_encoder_decoder=True,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
normalize_before=False,
|
||||
add_final_layer_norm=False,
|
||||
do_blenderbot_90_layernorm=True,
|
||||
activation_function="gelu",
|
||||
d_model=2560,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
init_std=0.02,
|
||||
decoder_start_token_id=1,
|
||||
classifier_dropout=0.0,
|
||||
scale_embedding=False,
|
||||
normalize_embedding=True,
|
||||
static_position_embeddings=False,
|
||||
add_bias_logits=False,
|
||||
force_bos_token_to_be_generated=False,
|
||||
**common_kwargs
|
||||
gradient_checkpointing=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
Examples::
|
||||
|
||||
>>> from transformers import BlenderbotConfig
|
||||
>>> config = BlenderbotConfig.from_pretrained('facebook/blenderbot-90M')
|
||||
|
||||
"""
|
||||
if "hidden_size" in common_kwargs:
|
||||
raise ValueError("hidden size is called d_model")
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
vocab_size=vocab_size,
|
||||
d_model=d_model,
|
||||
encoder_ffn_dim=encoder_ffn_dim,
|
||||
encoder_layers=encoder_layers,
|
||||
encoder_layerdrop=encoder_layerdrop,
|
||||
encoder_attention_heads=encoder_attention_heads,
|
||||
decoder_layerdrop=decoder_layerdrop,
|
||||
decoder_ffn_dim=decoder_ffn_dim,
|
||||
decoder_layers=decoder_layers,
|
||||
normalize_before=normalize_before,
|
||||
normalize_embedding=normalize_embedding,
|
||||
static_position_embeddings=static_position_embeddings,
|
||||
add_bias_logits=add_bias_logits,
|
||||
force_bos_token_to_be_generated=force_bos_token_to_be_generated,
|
||||
do_blenderbot_90_layernorm=do_blenderbot_90_layernorm,
|
||||
add_final_layer_norm=add_final_layer_norm,
|
||||
scale_embedding=scale_embedding,
|
||||
attention_dropout=attention_dropout,
|
||||
dropout=dropout,
|
||||
classifier_dropout=classifier_dropout,
|
||||
activation_dropout=activation_dropout,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
extra_pos_embeddings=extra_pos_embeddings,
|
||||
activation_function=activation_function,
|
||||
decoder_attention_heads=decoder_attention_heads,
|
||||
**common_kwargs,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
self.encoder_layers = encoder_layers
|
||||
self.encoder_attention_heads = encoder_attention_heads
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
self.decoder_layers = decoder_layers
|
||||
self.decoder_attention_heads = decoder_attention_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.activation_function = activation_function
|
||||
self.init_std = init_std
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.decoder_layerdrop = decoder_layerdrop
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.use_cache = use_cache
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
# IMPORTANT
|
||||
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
|
||||
self.extra_pos_embeddings = 0
|
||||
self.normalize_before = True
|
||||
self.add_final_layer_norm = True
|
||||
self.do_blenderbot_90_layernorm = True
|
||||
self.normalize_embedding = False
|
||||
self.static_position_embeddings = False
|
||||
self.add_bias_logits = False
|
||||
self.force_bos_token_to_be_generated = False
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
1215
src/transformers/models/blenderbot/modeling_blenderbot.py
Normal file → Executable file
1215
src/transformers/models/blenderbot/modeling_blenderbot.py
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
# Copyright (c) Facebook, Inc. and its affiliates.
|
||||
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the MIT license found in the;
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
@@ -13,15 +12,10 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
""""BlenderbotTokenizer and BlenderbotSmallTokenizer"""
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
"""Tokenization class for Blenderbot."""
|
||||
|
||||
import regex as re
|
||||
from typing import List
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
from ..roberta.tokenization_roberta import RobertaTokenizer
|
||||
|
||||
@@ -93,177 +87,3 @@ def get_pairs(word):
|
||||
|
||||
pairs = set(pairs)
|
||||
return pairs
|
||||
|
||||
|
||||
class BlenderbotSmallTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
|
||||
Users should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
File containing the vocabulary.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
|
||||
The beginning of sentence token.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
|
||||
The end of sentence token.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
**kwargs
|
||||
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
|
||||
"""
|
||||
|
||||
vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
|
||||
pretrained_vocab_files_map = {
|
||||
"vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"},
|
||||
"merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"},
|
||||
}
|
||||
max_model_input_sizes = {"facebook/blenderbot-90M": 512}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
merges_file,
|
||||
bos_token="__start__",
|
||||
eos_token="__end__",
|
||||
unk_token="__unk__",
|
||||
pad_token="__null__",
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
with open(merges_file, encoding="utf-8") as merges_handle:
|
||||
merges = merges_handle.read().split("\n")[1:-1]
|
||||
merges = [tuple(merge.split()) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
return dict(self.encoder, **self.added_tokens_encoder)
|
||||
|
||||
def bpe(self, token: str) -> str:
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
token = re.sub("([.,!?()])", r" \1", token)
|
||||
token = re.sub("(')", r" \1 ", token)
|
||||
token = re.sub(r"\s{2,}", " ", token)
|
||||
if "\n" in token:
|
||||
token = token.replace("\n", " __newln__")
|
||||
|
||||
tokens = token.split(" ")
|
||||
words = []
|
||||
for token in tokens:
|
||||
if not len(token):
|
||||
continue
|
||||
|
||||
token = token.lower()
|
||||
word = tuple(token)
|
||||
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
words.append(token)
|
||||
continue
|
||||
|
||||
while True:
|
||||
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except ValueError:
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
|
||||
new_word.append(first + second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = "@@ ".join(word)
|
||||
word = word[:-4]
|
||||
|
||||
self.cache[token] = word
|
||||
words.append(word)
|
||||
return " ".join(words)
|
||||
|
||||
def _tokenize(self, text: str) -> List[str]:
|
||||
""" Split a string into tokens using BPE."""
|
||||
split_tokens = []
|
||||
|
||||
words = re.findall(r"\S+\n?", text)
|
||||
|
||||
for token in words:
|
||||
split_tokens.extend([t for t in self.bpe(token).split(" ")])
|
||||
return split_tokens
|
||||
|
||||
def _convert_token_to_id(self, token: str) -> int:
|
||||
""" Converts a token to an id using the vocab. """
|
||||
token = token.lower()
|
||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> str:
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
return self.decoder.get(index, self.unk_token)
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
||||
""" Converts a sequence of tokens in a single string. """
|
||||
out_string = " ".join(tokens).replace("@@ ", "").strip()
|
||||
return out_string
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||
return
|
||||
vocab_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
|
||||
)
|
||||
merge_file = os.path.join(
|
||||
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
|
||||
)
|
||||
|
||||
with open(vocab_file, "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(self.encoder, ensure_ascii=False))
|
||||
|
||||
index = 0
|
||||
with open(merge_file, "w", encoding="utf-8") as writer:
|
||||
writer.write("#version: 0.2\n")
|
||||
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
|
||||
if index != token_index:
|
||||
logger.warning(
|
||||
"Saving vocabulary to {}: BPE merge indices are not consecutive."
|
||||
" Please check that the tokenizer is not corrupted!".format(merge_file)
|
||||
)
|
||||
index = token_index
|
||||
writer.write(" ".join(bpe_tokens) + "\n")
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
|
||||
29
src/transformers/models/blenderbot_small/__init__.py
Normal file
29
src/transformers/models/blenderbot_small/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from ...file_utils import is_torch_available
|
||||
from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig
|
||||
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_blenderbot_small import (
|
||||
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
BlenderbotSmallForConditionalGeneration,
|
||||
BlenderbotSmallModel,
|
||||
BlenderbotSmallPreTrainedModel,
|
||||
)
|
||||
@@ -0,0 +1,170 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" BlenderbotSmall model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/config.json",
|
||||
# See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small
|
||||
}
|
||||
|
||||
|
||||
class BlenderbotSmallConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is
    used to instantiate a BlenderbotSmall model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall
    `facebook/blenderbot_small-90M <https://huggingface.co/facebook/blenderbot_small-90M>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.


    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 50265):
            Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be
            represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or
            :class:`~transformers.TFBlenderbotSmallModel`.
        d_model (:obj:`int`, `optional`, defaults to 512):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (:obj:`int`, `optional`, defaults to 8):
            Number of encoder layers.
        decoder_layers (:obj:`int`, `optional`, defaults to 8):
            Number of decoder layers.
        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for classifier.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Scale embeddings by a factor of sqrt(d_model).
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models)
        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
        decoder_start_token_id (:obj:`int`, `optional`, defaults to 1):
            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
        pad_token_id (:obj:`int`, `optional`, defaults to 0):
            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
        bos_token_id (:obj:`int`, `optional`, defaults to 1):
            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
        eos_token_id (:obj:`int`, `optional`, defaults to 2):
            Forwarded unchanged to :class:`~transformers.PretrainedConfig`.
        **kwargs
            Additional keyword arguments forwarded to :class:`~transformers.PretrainedConfig`.

    Example::

        >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig

        >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
        >>> configuration = BlenderbotSmallConfig()

        >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
        >>> model = BlenderbotSmallModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "blenderbot-small"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=512,
        encoder_layers=8,
        encoder_ffn_dim=2048,
        encoder_attention_heads=16,
        decoder_layers=8,
        decoder_ffn_dim=2048,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=512,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        decoder_start_token_id=1,
        classifier_dropout=0.0,
        scale_embedding=False,
        gradient_checkpointing=False,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs
    ):
        # Special-token ids and the encoder/decoder flag are handled by the base class;
        # everything architecture-specific is stored on this instance below.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # num_hidden_layers mirrors the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.gradient_checkpointing = gradient_checkpointing
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

    @property
    def num_attention_heads(self) -> int:
        """Alias for :obj:`encoder_attention_heads`."""
        return self.encoder_attention_heads

    @property
    def hidden_size(self) -> int:
        """Alias for :obj:`d_model`."""
        return self.d_model
|
||||
1209
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
Executable file
1209
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,228 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization class for BlenderbotSmall."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import regex as re
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
VOCAB_FILES_NAMES = {
|
||||
"vocab_file": "vocab.json",
|
||||
"merges_file": "merges.txt",
|
||||
# "tokenizer_config_file": "tokenizer_config.json",
|
||||
}
|
||||
|
||||
|
||||
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).

    Args:
        word: Sequence of symbols (e.g. a tuple of strings) that supports slicing.

    Returns:
        :obj:`set` of ``(left, right)`` tuples, one per pair of adjacent symbols. The set is empty when ``word``
        holds fewer than two symbols.
    """
    # Pairing the sequence with itself shifted by one yields every adjacent pair; zip stops at the shorter
    # input, so empty and single-symbol words produce an empty set instead of raising IndexError.
    return set(zip(word, word[1:]))
|
||||
|
||||
|
||||
class BlenderbotSmallTokenizer(PreTrainedTokenizer):
|
||||
"""
|
||||
Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
|
||||
Users should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
File containing the vocabulary.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
|
||||
The beginning of sentence token.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
|
||||
The end of sentence token.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"__null__"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
**kwargs
|
||||
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
|
||||
"""
|
||||
|
||||
vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
|
||||
pretrained_vocab_files_map = {
|
||||
"vocab_file": {
|
||||
"facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/vocab.json"
|
||||
},
|
||||
"merges_file": {
|
||||
"facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/merges.txt"
|
||||
},
|
||||
}
|
||||
max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
merges_file,
|
||||
bos_token="__start__",
|
||||
eos_token="__end__",
|
||||
unk_token="__unk__",
|
||||
pad_token="__null__",
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
with open(merges_file, encoding="utf-8") as merges_handle:
|
||||
merges = merges_handle.read().split("\n")[1:-1]
|
||||
merges = [tuple(merge.split()) for merge in merges]
|
||||
self.bpe_ranks = dict(zip(merges, range(len(merges))))
|
||||
self.cache = {}
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> int:
|
||||
return len(self.encoder)
|
||||
|
||||
def get_vocab(self) -> Dict:
|
||||
return dict(self.encoder, **self.added_tokens_encoder)
|
||||
|
||||
def bpe(self, token: str) -> str:
    """
    Apply the byte-pair-encoding merges to a chunk of text.

    Punctuation and apostrophes are detached from their neighbours, a newline becomes ``__newln__``, and every
    resulting piece is lower-cased and merged greedily following :obj:`self.bpe_ranks`. Non-final sub-word units
    of a piece carry a trailing ``@@`` marker.

    Args:
        token (:obj:`str`): The text chunk to encode.

    Returns:
        :obj:`str`: The sub-word units, separated by single spaces.
    """
    if token in self.cache:
        return self.cache[token]

    # Detach punctuation / apostrophes, then squeeze runs of whitespace down to one space.
    text = re.sub("([.,!?()])", r" \1", token)
    text = re.sub("(')", r" \1 ", text)
    text = re.sub(r"\s{2,}", " ", text)
    if "\n" in text:
        text = text.replace("\n", " __newln__")

    no_rank = float("inf")
    encoded = []
    for piece in text.split(" "):
        if len(piece) == 0:
            continue

        piece = piece.lower()
        # One symbol per character, with the end-of-word marker glued onto the last one.
        symbols = tuple(piece[:-1]) + (piece[-1] + "</w>",)
        pairs = get_pairs(symbols)

        if not pairs:
            # No symbol pairs to merge (presumably a single-character piece): emit it untouched.
            encoded.append(piece)
            continue

        while True:
            # Pick the known merge with the lowest rank; stop once no remaining pair is a known merge.
            best = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, no_rank))
            if best not in self.bpe_ranks:
                break
            left, right = best

            merged = []
            pos = 0
            while pos < len(symbols):
                try:
                    hit = symbols.index(left, pos)
                except ValueError:
                    # `left` does not occur any more: copy the tail and finish this pass.
                    merged.extend(symbols[pos:])
                    break
                merged.extend(symbols[pos:hit])
                pos = hit

                if pos < len(symbols) - 1 and symbols[pos + 1] == right:
                    merged.append(left + right)
                    pos += 2
                else:
                    merged.append(symbols[pos])
                    pos += 1

            symbols = tuple(merged)
            if len(symbols) == 1:
                break
            pairs = get_pairs(symbols)

        # Join with the continuation marker and strip the trailing "</w>" (4 characters).
        joined = "@@ ".join(symbols)[:-4]

        self.cache[piece] = joined
        encoded.append(joined)
    return " ".join(encoded)
|
||||
|
||||
def _tokenize(self, text: str) -> List[str]:
    """ Split a string into tokens using BPE."""
    pieces = []
    # Each chunk is a run of non-whitespace characters, optionally followed by one newline.
    for chunk in re.findall(r"\S+\n?", text):
        pieces += self.bpe(chunk).split(" ")
    return pieces
|
||||
|
||||
def _convert_token_to_id(self, token: str) -> int:
    """ Converts a token to an id using the vocab; the token is lower-cased and unknown tokens map to the unk id. """
    vocab = self.encoder
    fallback_id = vocab.get(self.unk_token)
    return vocab.get(token.lower(), fallback_id)
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> str:
    """Converts an index (integer) in a token (str) using the vocab; unknown indices yield the unk token."""
    id_to_token = self.decoder
    return id_to_token.get(index, self.unk_token)
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """ Converts a sequence of tokens in a single string, gluing ``@@``-marked sub-words back together. """
    joined = " ".join(tokens)
    return joined.replace("@@ ", "").strip()
|
||||
|
||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary (JSON) and the BPE merges (text) into :obj:`save_directory`.

    Args:
        save_directory (:obj:`str`): An existing directory to write into.
        filename_prefix (:obj:`str`, `optional`): If given, prepended to both file names followed by ``-``.

    Returns:
        The paths of the vocabulary file and of the merges file. If :obj:`save_directory` is not a directory,
        an error is logged and :obj:`None` is returned.
    """
    if not os.path.isdir(save_directory):
        logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
    merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])

    with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
        vocab_writer.write(json.dumps(self.encoder, ensure_ascii=False))

    expected_rank = 0
    with open(merge_file, "w", encoding="utf-8") as merges_writer:
        merges_writer.write("#version: 0.2\n")
        # Merges are written in rank order; a gap in the ranks only triggers a warning.
        for pair, rank in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
            if rank != expected_rank:
                logger.warning(
                    "Saving vocabulary to {}: BPE merge indices are not consecutive."
                    " Please check that the tokenizer is not corrupted!".format(merge_file)
                )
                expected_rank = rank
            merges_writer.write(" ".join(pair) + "\n")
            expected_rank += 1

    return vocab_file, merge_file
|
||||
@@ -0,0 +1,103 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fast tokenization class for BlenderbotSmall."""
|
||||
from typing import List, Optional
|
||||
|
||||
from tokenizers import ByteLevelBPETokenizer
|
||||
|
||||
from ...tokenization_utils_fast import PreTrainedTokenizerFast
|
||||
from ...utils import logging
|
||||
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"facebook/blenderbot_small-90M": 512,
|
||||
}
|
||||
|
||||
|
||||
class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's `tokenizers` library).

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        merges_file (:obj:`str`):
            Path to the merges file.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
            The unknown token.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
            The beginning of sequence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
            The end of sequence token.
        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Forwarded to the backing :obj:`ByteLevelBPETokenizer` and kept on the instance.
        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Forwarded to the backing :obj:`ByteLevelBPETokenizer`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = BlenderbotSmallTokenizer

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs
    ):
        # Build the backend tokenizer first, then hand it to the generic fast-tokenizer wrapper.
        backend = ByteLevelBPETokenizer(
            vocab_file=vocab_file,
            merges_file=merges_file,
            add_prefix_space=add_prefix_space,
            trim_offsets=trim_offsets,
        )
        super().__init__(
            backend,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            **kwargs,
        )
        self.add_prefix_space = add_prefix_space

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap one or two sequences with special tokens: ``bos A eos`` for a single sequence and
        ``bos A eos eos B eos`` for a pair.
        """
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        single = bos + token_ids_0 + eos
        if token_ids_1 is not None:
            return single + eos + token_ids_1 + eos
        return single

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall
        does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Only the length of the fully decorated sequence matters; every position gets type id 0.
        decorated = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            decorated = decorated + sep + token_ids_1 + sep
        return [0] * len(decorated)
|
||||
@@ -77,10 +77,21 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Indices of decoder input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
|
||||
If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
|
||||
:obj:`past_key_values`).
|
||||
|
||||
Provide for sequence to sequence training to the decoder. Indices can be obtained using
|
||||
:class:`~transformers.PretrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and
|
||||
:meth:`transformers.PreTrainedTokenizer.__call__` for details.
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default.
|
||||
encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`):
|
||||
|
||||
@@ -235,7 +235,7 @@ FSMT_INPUTS_DOCSTRING = r"""
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for translation and summarization training. By default, the model will create this tensor by
|
||||
shifting the input_ids right, following the paper.
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default. If you want to change padding behavior, you should read
|
||||
:func:`modeling_fsmt._prepare_fsmt_decoder_inputs` and modify. See diagram 1 in the paper for more info on
|
||||
|
||||
@@ -15,16 +15,20 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available
|
||||
from .configuration_marian import MarianConfig
|
||||
from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig
|
||||
|
||||
|
||||
if is_sentencepiece_available():
|
||||
from .tokenization_marian import MarianTokenizer
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_marian import MarianMTModel
|
||||
from .modeling_marian import (
|
||||
MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
MarianModel,
|
||||
MarianMTModel,
|
||||
MarianPreTrainedModel,
|
||||
)
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_marian import TFMarianMTModel
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team.
|
||||
# Copyright 2021 The Marian Team Authors and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,40 +14,48 @@
|
||||
# limitations under the License.
|
||||
""" Marian model configuration """
|
||||
|
||||
from ..bart.configuration_bart import BartConfig
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/config.json",
|
||||
# See all Marian models at https://huggingface.co/models?filter=marian
|
||||
}
|
||||
|
||||
|
||||
class MarianConfig(BartConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to
|
||||
instantiate a Marian model according to the specified arguments, defining the model architecture.
|
||||
class MarianConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.MarianModel`. It is used to
|
||||
instantiate a Marian model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the Marian
|
||||
`Helsinki-NLP/opus-mt-en-de <https://huggingface.co/Helsinki-NLP/opus-mt-en-de>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 58101):
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.MarianMTModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 512):
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.MarianModel` or
|
||||
:class:`~transformers.TFMarianModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 6):
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of encoder layers.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 6):
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
||||
@@ -59,42 +67,113 @@ class MarianConfig(BartConfig):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for classifier.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
This should be completed, specific to marian.
|
||||
normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Call layernorm before attention ops.
|
||||
normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Call layernorm after embeddings.
|
||||
static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Don't learn positional embeddings, use sinusoidal.
|
||||
add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Why not add another layernorm?
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by diving by sqrt(d_model).
|
||||
eos_token_id (:obj:`int`, `optional`, defaults to 2)
|
||||
End of stream token id.
|
||||
pad_token_id (:obj:`int`, `optional`, defaults to 1)
|
||||
Padding token id.
|
||||
bos_token_id (:obj:`int`, `optional`, defaults to 0)
|
||||
Beginning of stream token id.
|
||||
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
|
||||
How many extra learned positional embeddings to use.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether this is an encoder/decoder model
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
|
||||
"""
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by a factor of sqrt(d_model).
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from transformers import MarianModel, MarianConfig
|
||||
|
||||
>>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
|
||||
>>> configuration = MarianConfig()
|
||||
|
||||
>>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
|
||||
>>> model = MarianModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "marian"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
    self,
    vocab_size=50265,
    max_position_embeddings=1024,
    encoder_layers=12,
    encoder_ffn_dim=4096,
    encoder_attention_heads=16,
    decoder_layers=12,
    decoder_ffn_dim=4096,
    decoder_attention_heads=16,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    use_cache=True,
    is_encoder_decoder=True,
    activation_function="gelu",
    d_model=1024,
    dropout=0.1,
    attention_dropout=0.0,
    activation_dropout=0.0,
    init_std=0.02,
    decoder_start_token_id=58100,
    classifier_dropout=0.0,
    scale_embedding=False,
    gradient_checkpointing=False,
    pad_token_id=58100,
    eos_token_id=0,
    **kwargs
):
    """
    Store the Marian hyper-parameters on the configuration object.

    The special token ids, :obj:`is_encoder_decoder`, :obj:`decoder_start_token_id` and any extra keyword
    arguments are forwarded to the parent initializer; every other argument is stored as an attribute of the
    same name.
    """
    super().__init__(
        pad_token_id=pad_token_id,
        eos_token_id=eos_token_id,
        is_encoder_decoder=is_encoder_decoder,
        decoder_start_token_id=decoder_start_token_id,
        **kwargs,
    )

    # Architecture sizes and regularisation settings, stored verbatim.
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.d_model = d_model
    self.encoder_ffn_dim = encoder_ffn_dim
    self.encoder_layers = encoder_layers
    self.encoder_attention_heads = encoder_attention_heads
    self.decoder_ffn_dim = decoder_ffn_dim
    self.decoder_layers = decoder_layers
    self.decoder_attention_heads = decoder_attention_heads
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.activation_dropout = activation_dropout
    self.activation_function = activation_function
    self.init_std = init_std
    self.encoder_layerdrop = encoder_layerdrop
    self.decoder_layerdrop = decoder_layerdrop
    self.classifier_dropout = classifier_dropout
    self.use_cache = use_cache
    # `num_hidden_layers` mirrors the encoder depth.
    self.num_hidden_layers = encoder_layers
    self.gradient_checkpointing = gradient_checkpointing
    self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

    # IMPORTANT
    # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
    # These values are hard-coded here and cannot be set through the constructor arguments.
    self.extra_pos_embeddings = 0
    self.normalize_before = False
    self.add_final_layer_norm = False
    self.do_blenderbot_90_layernorm = False
    self.normalize_embedding = False
    self.static_position_embeddings = True
    self.add_bias_logits = False
    self.force_bos_token_to_be_generated = False
|
||||
|
||||
@property
def num_attention_heads(self) -> int:
    """Alias exposing :obj:`encoder_attention_heads` under the generic attribute name."""
    heads = self.encoder_attention_heads
    return heads
|
||||
|
||||
@property
def hidden_size(self) -> int:
    """Alias exposing :obj:`d_model` under the generic attribute name."""
    width = self.d_model
    return width
|
||||
|
||||
1208
src/transformers/models/marian/modeling_marian.py
Normal file → Executable file
1208
src/transformers/models/marian/modeling_marian.py
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
@@ -84,7 +84,7 @@ class MarianTokenizer(PreTrainedTokenizer):
|
||||
>>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
|
||||
>>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
|
||||
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."] # optional
|
||||
>>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt")
|
||||
>>> batch_enc = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt")
|
||||
>>> # keys [input_ids, attention_mask, labels].
|
||||
>>> # model(**batch) should work
|
||||
"""
|
||||
|
||||
@@ -15,9 +15,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_mbart import MBartConfig
|
||||
from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
|
||||
|
||||
|
||||
if is_sentencepiece_available():
|
||||
@@ -27,7 +26,14 @@ if is_tokenizers_available():
|
||||
from .tokenization_mbart_fast import MBartTokenizerFast
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_mbart import MBartForConditionalGeneration, MBartModel
|
||||
from .modeling_mbart import (
|
||||
MBART_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
MBartForConditionalGeneration,
|
||||
MBartForQuestionAnswering,
|
||||
MBartForSequenceClassification,
|
||||
MBartModel,
|
||||
MBartPreTrainedModel,
|
||||
)
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_mbart import TFMBartForConditionalGeneration
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
|
||||
# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -12,33 +12,36 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MBART configuration """
|
||||
""" MBART model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..bart.configuration_bart import BartConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json",
|
||||
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/config.json",
|
||||
# See all MBART models at https://huggingface.co/models?filter=mbart
|
||||
}
|
||||
|
||||
|
||||
class MBartConfig(BartConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a
|
||||
:class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
class MBartConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.MBartModel`. It is used to
|
||||
instantiate an MBART model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the MBART `facebook/mbart-large-cc25
|
||||
<https://huggingface.co/facebook/mbart-large-cc25>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 250027):
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.MBartForConditionalGeneration`.
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.MBartModel` or
|
||||
:class:`~transformers.TFMBartModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
@@ -50,9 +53,9 @@ class MBartConfig(BartConfig):
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
||||
@@ -69,37 +72,108 @@ class MBartConfig(BartConfig):
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
This should be completed, specific to marian.
|
||||
normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Call layernorm before attention ops.
|
||||
normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Call layernorm after embeddings. Only True for Bart.
|
||||
static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Don't learn positional embeddings, use sinusoidal.
|
||||
add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Why not add another layernorm?
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by diving by sqrt(d_model).
|
||||
eos_token_id (:obj:`int`, `optional`, defaults to 2)
|
||||
End of stream token id.
|
||||
pad_token_id (:obj:`int`, `optional`, defaults to 1)
|
||||
Padding token id.
|
||||
bos_token_id (:obj:`int`, `optional`, defaults to 0)
|
||||
Beginning of stream token id.
|
||||
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
|
||||
How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether this is an encoder/decoder model
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
|
||||
"""
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by a factor of sqrt(d_model).
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import MBartModel, MBartConfig
|
||||
|
||||
>>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
|
||||
>>> configuration = MBartConfig()
|
||||
|
||||
>>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
|
||||
>>> model = MBartModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "mbart"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
|
||||
def __init__(
    self,
    vocab_size=50265,
    max_position_embeddings=1024,
    encoder_layers=12,
    encoder_ffn_dim=4096,
    encoder_attention_heads=16,
    decoder_layers=12,
    decoder_ffn_dim=4096,
    decoder_attention_heads=16,
    encoder_layerdrop=0.0,
    decoder_layerdrop=0.0,
    use_cache=True,
    is_encoder_decoder=True,
    activation_function="gelu",
    d_model=1024,
    dropout=0.1,
    attention_dropout=0.0,
    activation_dropout=0.0,
    init_std=0.02,
    classifier_dropout=0.0,
    scale_embedding=False,
    gradient_checkpointing=False,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    **kwargs
):
    """
    Store the MBART hyper-parameters on the configuration object.

    The special token ids, :obj:`is_encoder_decoder` and any extra keyword arguments are forwarded to the parent
    initializer; every other argument is stored as an attribute of the same name.
    """
    super().__init__(
        pad_token_id=pad_token_id,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        is_encoder_decoder=is_encoder_decoder,
        **kwargs,
    )

    # Architecture sizes and regularisation settings, stored verbatim.
    self.vocab_size = vocab_size
    self.max_position_embeddings = max_position_embeddings
    self.d_model = d_model
    self.encoder_ffn_dim = encoder_ffn_dim
    self.encoder_layers = encoder_layers
    self.encoder_attention_heads = encoder_attention_heads
    self.decoder_ffn_dim = decoder_ffn_dim
    self.decoder_layers = decoder_layers
    self.decoder_attention_heads = decoder_attention_heads
    self.dropout = dropout
    self.attention_dropout = attention_dropout
    self.activation_dropout = activation_dropout
    self.activation_function = activation_function
    self.init_std = init_std
    self.encoder_layerdrop = encoder_layerdrop
    self.decoder_layerdrop = decoder_layerdrop
    self.classifier_dropout = classifier_dropout
    self.use_cache = use_cache
    # `num_hidden_layers` mirrors the encoder depth.
    self.num_hidden_layers = encoder_layers
    self.gradient_checkpointing = gradient_checkpointing
    self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

    # IMPORTANT
    # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
    # These values are hard-coded here and cannot be set through the constructor arguments.
    self.extra_pos_embeddings = 2
    self.normalize_before = True
    self.add_final_layer_norm = True
    self.do_blenderbot_90_layernorm = False
    self.normalize_embedding = True
    self.static_position_embeddings = False
    self.add_bias_logits = False
    self.force_bos_token_to_be_generated = False
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
1472
src/transformers/models/mbart/modeling_mbart.py
Normal file → Executable file
1472
src/transformers/models/mbart/modeling_mbart.py
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
@@ -15,9 +15,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_pegasus import PegasusConfig
|
||||
from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig
|
||||
|
||||
|
||||
if is_sentencepiece_available():
|
||||
@@ -27,7 +26,12 @@ if is_tokenizers_available():
|
||||
from .tokenization_pegasus_fast import PegasusTokenizerFast
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel
|
||||
from .modeling_pegasus import (
|
||||
PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
PegasusForConditionalGeneration,
|
||||
PegasusModel,
|
||||
PegasusPreTrainedModel,
|
||||
)
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_pegasus import TFPegasusForConditionalGeneration
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 Google and The HuggingFace Inc. team.
|
||||
# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,84 +14,48 @@
|
||||
# limitations under the License.
|
||||
""" PEGASUS model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
from ..bart.configuration_bart import BartConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# These config values do not vary between checkpoints
|
||||
DEFAULTS = dict(
|
||||
vocab_size=96103,
|
||||
max_position_embeddings=512,
|
||||
d_model=1024,
|
||||
encoder_ffn_dim=4096,
|
||||
decoder_ffn_dim=4096,
|
||||
encoder_attention_heads=16,
|
||||
decoder_attention_heads=16,
|
||||
encoder_layers=16,
|
||||
decoder_layers=16,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.1,
|
||||
activation_dropout=0.1,
|
||||
pad_token_id=0,
|
||||
eos_token_id=1,
|
||||
is_encoder_decoder=True,
|
||||
normalize_before=True,
|
||||
scale_embedding=True,
|
||||
normalize_embedding=False,
|
||||
add_final_layer_norm=True,
|
||||
static_position_embeddings=True,
|
||||
num_beams=8,
|
||||
activation_function="relu",
|
||||
)
|
||||
# Config values that vary between checkpoints: for testing and conversion
|
||||
task_specific_params = {
|
||||
# These are task specific params for pegasus-large and normal params for finetuned checkpoints
|
||||
"summarization_xsum": {"length_penalty": 0.6, "max_length": 64, "max_position_embeddings": 512},
|
||||
"summarization_cnn_dailymail": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 1024},
|
||||
"summarization_newsroom": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 512},
|
||||
"summarization_wikihow": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 512},
|
||||
"summarization_multi_news": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
|
||||
"summarization_reddit_tifu": {"length_penalty": 0.6, "max_length": 128, "max_position_embeddings": 512},
|
||||
"summarization_big_patent": {"length_penalty": 0.7, "max_length": 256, "max_position_embeddings": 1024},
|
||||
"summarization_arxiv": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
|
||||
"summarization_pubmed": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
|
||||
"summarization_gigaword": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 128},
|
||||
"summarization_aeslc": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 512},
|
||||
"summarization_billsum": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 1024},
|
||||
# this last entry is useless -- just for consistency
|
||||
"summarization_large": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
|
||||
PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/config.json",
|
||||
# See all PEGASUS models at https://huggingface.co/models?filter=pegasus
|
||||
}
|
||||
|
||||
|
||||
class PegasusConfig(BartConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of a
|
||||
:class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the
|
||||
specified arguments, defining the model architecture.
|
||||
class PegasusConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. It is used to
|
||||
instantiate a PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the PEGASUS `google/pegasus-large
|
||||
<https://huggingface.co/google/pegasus-large>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 96103):
|
||||
Vocabulary size of the Pegasus model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`input_ids` passed when calling :class:`~transformers.PegasusForConditionalGeneration`.
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`input_ids` passed when calling :class:`~transformers.PegasusModel` or
|
||||
:class:`~transformers.TFPegasusModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 16):
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of encoder layers.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 16):
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in encoder.
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
||||
@@ -108,38 +72,108 @@ class PegasusConfig(BartConfig):
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
This should be completed, specific to marian.
|
||||
normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Call layernorm before attention ops.
|
||||
normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Call layernorm after embeddings.
|
||||
static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Don't learn positional embeddings, use sinusoidal.
|
||||
add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Why not add another layernorm?
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Scale embeddings by multiplying by sqrt(d_model).
|
||||
eos_token_id (:obj:`int`, `optional`, defaults to 2)
|
||||
End of stream token id.
|
||||
pad_token_id (:obj:`int`, `optional`, defaults to 1)
|
||||
Padding token id.
|
||||
bos_token_id (:obj:`int`, `optional`, defaults to 0)
|
||||
Beginning of stream token id.
|
||||
encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper
|
||||
<https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper
|
||||
<https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
|
||||
How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
|
||||
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether this is an encoder/decoder model
|
||||
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
|
||||
"""
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Scale embeddings by multiplying by sqrt(d_model).
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import PegasusModel, PegasusConfig
|
||||
|
||||
>>> # Initializing a PEGASUS google/pegasus-large style configuration
|
||||
>>> configuration = PegasusConfig()
|
||||
|
||||
>>> # Initializing a model from the google/pegasus-large style configuration
|
||||
>>> model = PegasusModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "pegasus"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
# The implementation of the config object is in BartConfig
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50265,
|
||||
max_position_embeddings=1024,
|
||||
encoder_layers=12,
|
||||
encoder_ffn_dim=4096,
|
||||
encoder_attention_heads=16,
|
||||
decoder_layers=12,
|
||||
decoder_ffn_dim=4096,
|
||||
decoder_attention_heads=16,
|
||||
encoder_layerdrop=0.0,
|
||||
decoder_layerdrop=0.0,
|
||||
use_cache=True,
|
||||
is_encoder_decoder=True,
|
||||
activation_function="gelu",
|
||||
d_model=1024,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
init_std=0.02,
|
||||
decoder_start_token_id=0,
|
||||
classifier_dropout=0.0,
|
||||
scale_embedding=False,
|
||||
gradient_checkpointing=False,
|
||||
pad_token_id=0,
|
||||
eos_token_id=1,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
self.encoder_layers = encoder_layers
|
||||
self.encoder_attention_heads = encoder_attention_heads
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
self.decoder_layers = decoder_layers
|
||||
self.decoder_attention_heads = decoder_attention_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.activation_function = activation_function
|
||||
self.init_std = init_std
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.decoder_layerdrop = decoder_layerdrop
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.use_cache = use_cache
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
|
||||
|
||||
# IMPORTANT
|
||||
# DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
|
||||
self.extra_pos_embeddings = 0
|
||||
self.normalize_before = True
|
||||
self.add_final_layer_norm = True
|
||||
self.do_blenderbot_90_layernorm = False
|
||||
self.normalize_embedding = False
|
||||
self.static_position_embeddings = True
|
||||
self.add_bias_logits = False
|
||||
self.force_bos_token_to_be_generated = False
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
1236
src/transformers/models/pegasus/modeling_pegasus.py
Normal file → Executable file
1236
src/transformers/models/pegasus/modeling_pegasus.py
Normal file → Executable file
File diff suppressed because it is too large
Load Diff
@@ -88,9 +88,19 @@ PROPHETNET_INPUTS_DOCSTRING = r"""
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for translation and summarization training. By default, the model will create this tensor by
|
||||
shifting the :obj:`input_ids` to the right, following the paper.
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
Indices of decoder input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
|
||||
ProphetNet uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
|
||||
:obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
|
||||
:obj:`past_key_values`).
|
||||
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default.
|
||||
|
||||
|
||||
@@ -1028,14 +1028,22 @@ T5_INPUTS_DOCSTRING = r"""
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for sequence to sequence training. T5 uses the :obj:`pad_token_id` as the starting token for
|
||||
:obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last
|
||||
:obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`).
|
||||
Indices of decoder input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.T5Tokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
|
||||
T5 uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
|
||||
:obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
|
||||
:obj:`past_key_values`).
|
||||
|
||||
To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training
|
||||
<./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset,
|
||||
:obj:`decoder_input_ids` takes the value of :obj:`input_ids`.
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default.
|
||||
encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`):
|
||||
|
||||
@@ -922,7 +922,7 @@ T5_INPUTS_DOCSTRING = r"""
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
|
||||
decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
also be used by default.
|
||||
encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor))`, `optional`):
|
||||
|
||||
@@ -609,6 +609,27 @@ class BlenderbotModel:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class BlenderbotSmallForConditionalGeneration:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class BlenderbotSmallModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
@@ -1327,6 +1348,15 @@ class LxmertXLayer:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class MarianModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class MarianMTModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
@@ -1345,6 +1375,24 @@ class MBartForConditionalGeneration:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class MBartForQuestionAnswering:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class MBartForSequenceClassification:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class MBartModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
Reference in New Issue
Block a user