LED (#9278)
* create model * add integration * save current state * make integration tests pass * add one more test * add explanation to tests * remove from bart * add padding * remove unnecessary test * make all tests pass * re-add cookie cutter tests * finish PyTorch * fix attention test * Update tests/test_modeling_common.py * revert change * remove unused file * add string to doc * save intermediate * make tf integration tests pass * finish tf * fix doc * fix docs again * add led to doctree * add to auto tokenizer * added tips for led * make style * apply jplus statements * correct tf longformer * apply lysandres suggestions * apply sylvains suggestions * Apply suggestions from code review
This commit is contained in:
committed by
GitHub
parent
314cca2842
commit
189387e9b2
@@ -146,6 +146,7 @@ from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, F
|
||||
from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
|
||||
from .models.herbert import HerbertTokenizer
|
||||
from .models.layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMTokenizer
|
||||
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
|
||||
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
|
||||
from .models.lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig, LxmertTokenizer
|
||||
from .models.marian import MarianConfig
|
||||
@@ -255,6 +256,7 @@ if is_tokenizers_available():
|
||||
from .models.gpt2 import GPT2TokenizerFast
|
||||
from .models.herbert import HerbertTokenizerFast
|
||||
from .models.layoutlm import LayoutLMTokenizerFast
|
||||
from .models.led import LEDTokenizerFast
|
||||
from .models.longformer import LongformerTokenizerFast
|
||||
from .models.lxmert import LxmertTokenizerFast
|
||||
from .models.mbart import MBartTokenizerFast
|
||||
@@ -299,6 +301,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
# Modeling
|
||||
if is_torch_available():
|
||||
|
||||
# Benchmarks
|
||||
from .benchmark.benchmark import PyTorchBenchmark
|
||||
from .benchmark.benchmark_args import PyTorchBenchmarkArguments
|
||||
@@ -507,6 +510,13 @@ if is_torch_available():
|
||||
LayoutLMForTokenClassification,
|
||||
LayoutLMModel,
|
||||
)
|
||||
from .models.led import (
|
||||
LED_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
LEDForConditionalGeneration,
|
||||
LEDForQuestionAnswering,
|
||||
LEDForSequenceClassification,
|
||||
LEDModel,
|
||||
)
|
||||
from .models.longformer import (
|
||||
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
LongformerForMaskedLM,
|
||||
@@ -695,6 +705,7 @@ else:
|
||||
|
||||
# TensorFlow
|
||||
if is_tf_available():
|
||||
|
||||
from .benchmark.benchmark_args_tf import TensorFlowBenchmarkArguments
|
||||
|
||||
# Benchmarks
|
||||
@@ -831,6 +842,7 @@ if is_tf_available():
|
||||
TFGPT2Model,
|
||||
TFGPT2PreTrainedModel,
|
||||
)
|
||||
from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel
|
||||
from .models.longformer import (
|
||||
TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TFLongformerForMaskedLM,
|
||||
|
||||
@@ -615,6 +615,7 @@ SLOW_TO_FAST_CONVERTERS = {
|
||||
"HerbertTokenizer": HerbertConverter,
|
||||
"LayoutLMTokenizer": BertConverter,
|
||||
"LongformerTokenizer": RobertaConverter,
|
||||
"LEDTokenizer": RobertaConverter,
|
||||
"LxmertTokenizer": BertConverter,
|
||||
"MBartTokenizer": MBartConverter,
|
||||
"MPNetTokenizer": MPNetConverter,
|
||||
|
||||
@@ -35,6 +35,7 @@ from ..fsmt.configuration_fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTCo
|
||||
from ..funnel.configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
|
||||
from ..gpt2.configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
|
||||
from ..layoutlm.configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig
|
||||
from ..led.configuration_led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig
|
||||
from ..longformer.configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
|
||||
from ..lxmert.configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
|
||||
from ..marian.configuration_marian import MarianConfig
|
||||
@@ -66,6 +67,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
(key, value)
|
||||
for pretrained_map in [
|
||||
# Add archive maps here
|
||||
LED_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BART_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@@ -105,6 +107,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
CONFIG_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add configs here
|
||||
("led", LEDConfig),
|
||||
("retribert", RetriBertConfig),
|
||||
("mt5", MT5Config),
|
||||
("t5", T5Config),
|
||||
@@ -150,6 +153,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
MODEL_NAMES_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add full (and cased) model names here
|
||||
("led", "LED"),
|
||||
("retribert", "RetriBERT"),
|
||||
("t5", "T5"),
|
||||
("mobilebert", "MobileBERT"),
|
||||
|
||||
@@ -101,6 +101,12 @@ from ..funnel.modeling_funnel import (
|
||||
)
|
||||
from ..gpt2.modeling_gpt2 import GPT2ForSequenceClassification, GPT2LMHeadModel, GPT2Model
|
||||
from ..layoutlm.modeling_layoutlm import LayoutLMForMaskedLM, LayoutLMForTokenClassification, LayoutLMModel
|
||||
from ..led.modeling_led import (
|
||||
LEDForConditionalGeneration,
|
||||
LEDForQuestionAnswering,
|
||||
LEDForSequenceClassification,
|
||||
LEDModel,
|
||||
)
|
||||
from ..longformer.modeling_longformer import (
|
||||
LongformerForMaskedLM,
|
||||
LongformerForMultipleChoice,
|
||||
@@ -221,6 +227,7 @@ from .configuration_auto import (
|
||||
FunnelConfig,
|
||||
GPT2Config,
|
||||
LayoutLMConfig,
|
||||
LEDConfig,
|
||||
LongformerConfig,
|
||||
LxmertConfig,
|
||||
MarianConfig,
|
||||
@@ -252,6 +259,7 @@ logger = logging.get_logger(__name__)
|
||||
MODEL_MAPPING = OrderedDict(
|
||||
[
|
||||
# Base model mapping
|
||||
(LEDConfig, LEDModel),
|
||||
(RetriBertConfig, RetriBertModel),
|
||||
(MT5Config, MT5Model),
|
||||
(T5Config, T5Model),
|
||||
@@ -327,6 +335,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
|
||||
MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model with LM heads mapping
|
||||
(LEDConfig, LEDForConditionalGeneration),
|
||||
(LayoutLMConfig, LayoutLMForMaskedLM),
|
||||
(T5Config, T5ForConditionalGeneration),
|
||||
(DistilBertConfig, DistilBertForMaskedLM),
|
||||
@@ -407,6 +416,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
|
||||
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model for Seq2Seq Causal LM mapping
|
||||
(LEDConfig, LEDForConditionalGeneration),
|
||||
(MT5Config, MT5ForConditionalGeneration),
|
||||
(T5Config, T5ForConditionalGeneration),
|
||||
(PegasusConfig, PegasusForConditionalGeneration),
|
||||
@@ -424,6 +434,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
|
||||
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model for Sequence Classification mapping
|
||||
(LEDConfig, LEDForSequenceClassification),
|
||||
(DistilBertConfig, DistilBertForSequenceClassification),
|
||||
(AlbertConfig, AlbertForSequenceClassification),
|
||||
(CamembertConfig, CamembertForSequenceClassification),
|
||||
@@ -453,6 +464,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model for Question Answering mapping
|
||||
(LEDConfig, LEDForQuestionAnswering),
|
||||
(DistilBertConfig, DistilBertForQuestionAnswering),
|
||||
(AlbertConfig, AlbertForQuestionAnswering),
|
||||
(CamembertConfig, CamembertForQuestionAnswering),
|
||||
|
||||
@@ -90,6 +90,7 @@ from ..funnel.modeling_tf_funnel import (
|
||||
TFFunnelModel,
|
||||
)
|
||||
from ..gpt2.modeling_tf_gpt2 import TFGPT2ForSequenceClassification, TFGPT2LMHeadModel, TFGPT2Model
|
||||
from ..led.modeling_tf_led import TFLEDForConditionalGeneration, TFLEDModel
|
||||
from ..longformer.modeling_tf_longformer import (
|
||||
TFLongformerForMaskedLM,
|
||||
TFLongformerForMultipleChoice,
|
||||
@@ -174,6 +175,7 @@ from .configuration_auto import (
|
||||
FlaubertConfig,
|
||||
FunnelConfig,
|
||||
GPT2Config,
|
||||
LEDConfig,
|
||||
LongformerConfig,
|
||||
LxmertConfig,
|
||||
MarianConfig,
|
||||
@@ -199,6 +201,7 @@ logger = logging.get_logger(__name__)
|
||||
TF_MODEL_MAPPING = OrderedDict(
|
||||
[
|
||||
# Base model mapping
|
||||
(LEDConfig, TFLEDModel),
|
||||
(LxmertConfig, TFLxmertModel),
|
||||
(MT5Config, TFMT5Model),
|
||||
(T5Config, TFT5Model),
|
||||
@@ -254,6 +257,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
|
||||
TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model with LM heads mapping
|
||||
(LEDConfig, TFLEDForConditionalGeneration),
|
||||
(T5Config, TFT5ForConditionalGeneration),
|
||||
(DistilBertConfig, TFDistilBertForMaskedLM),
|
||||
(AlbertConfig, TFAlbertForMaskedLM),
|
||||
@@ -317,6 +321,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
|
||||
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model for Seq2Seq Causal LM mapping
|
||||
(LEDConfig, TFLEDForConditionalGeneration),
|
||||
(MT5Config, TFMT5ForConditionalGeneration),
|
||||
(T5Config, TFT5ForConditionalGeneration),
|
||||
(MarianConfig, TFMarianMTModel),
|
||||
|
||||
@@ -36,6 +36,7 @@ from ..funnel.tokenization_funnel import FunnelTokenizer
|
||||
from ..gpt2.tokenization_gpt2 import GPT2Tokenizer
|
||||
from ..herbert.tokenization_herbert import HerbertTokenizer
|
||||
from ..layoutlm.tokenization_layoutlm import LayoutLMTokenizer
|
||||
from ..led.tokenization_led import LEDTokenizer
|
||||
from ..longformer.tokenization_longformer import LongformerTokenizer
|
||||
from ..lxmert.tokenization_lxmert import LxmertTokenizer
|
||||
from ..mobilebert.tokenization_mobilebert import MobileBertTokenizer
|
||||
@@ -69,6 +70,7 @@ from .configuration_auto import (
|
||||
FunnelConfig,
|
||||
GPT2Config,
|
||||
LayoutLMConfig,
|
||||
LEDConfig,
|
||||
LongformerConfig,
|
||||
LxmertConfig,
|
||||
MarianConfig,
|
||||
@@ -137,6 +139,7 @@ if is_tokenizers_available():
|
||||
from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
|
||||
from ..herbert.tokenization_herbert_fast import HerbertTokenizerFast
|
||||
from ..layoutlm.tokenization_layoutlm_fast import LayoutLMTokenizerFast
|
||||
from ..led.tokenization_led_fast import LEDTokenizerFast
|
||||
from ..longformer.tokenization_longformer_fast import LongformerTokenizerFast
|
||||
from ..lxmert.tokenization_lxmert_fast import LxmertTokenizerFast
|
||||
from ..mbart.tokenization_mbart_fast import MBartTokenizerFast
|
||||
@@ -226,6 +229,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(ProphetNetConfig, (ProphetNetTokenizer, None)),
|
||||
(MPNetConfig, (MPNetTokenizer, MPNetTokenizerFast)),
|
||||
(TapasConfig, (TapasTokenizer, None)),
|
||||
(LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
38
src/transformers/models/led/__init__.py
Normal file
38
src/transformers/models/led/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig
|
||||
from .tokenization_led import LEDTokenizer
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
from .tokenization_led_fast import LEDTokenizerFast
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_led import (
|
||||
LED_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
LEDForConditionalGeneration,
|
||||
LEDForQuestionAnswering,
|
||||
LEDForSequenceClassification,
|
||||
LEDModel,
|
||||
LEDPreTrainedModel,
|
||||
)
|
||||
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel
|
||||
179
src/transformers/models/led/configuration_led.py
Normal file
179
src/transformers/models/led/configuration_led.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" LED model configuration """
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
LED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/config.json",
|
||||
# See all LED models at https://huggingface.co/models?filter=led
|
||||
}
|
||||
|
||||
|
||||
class LEDConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.LEDModel`. It is used to
|
||||
instantiate an LED model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the LED `allenai/led-base-16384
|
||||
<https://huggingface.co/allenai/led-base-16384>`__ architecture.
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 50265):
|
||||
Vocabulary size of the LED model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.LEDModel` or :class:`~transformers.TFLEDModel`.
|
||||
d_model (:obj:`int`, `optional`, defaults to 1024):
|
||||
Dimensionality of the layers and the pooler layer.
|
||||
encoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of encoder layers.
|
||||
decoder_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
Number of decoder layers.
|
||||
encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
|
||||
Number of attention heads for each attention layer in the Transformer decoder.
|
||||
decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
|
||||
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
||||
activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for activations inside the fully connected layer.
|
||||
classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The dropout ratio for classifier.
|
||||
max_encoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
|
||||
The maximum sequence length that the encoder might ever be used with.
|
||||
max_decoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
|
||||
The maximum sequence length that the decoder might ever be used with.
|
||||
init_std (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the encoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
|
||||
The LayerDrop probability for the decoder. See the `LayerDrop paper <see
|
||||
https://arxiv.org/abs/1909.11556>`__ for more details.
|
||||
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models)
|
||||
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import LEDModel, LEDConfig
|
||||
|
||||
>>> # Initializing a LED allenai/led-base-16384 style configuration
|
||||
>>> configuration = LEDConfig()
|
||||
|
||||
>>> # Initializing a model from the allenai/led-base-16384 style configuration
|
||||
>>> model = LEDModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "led"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=50265,
|
||||
max_encoder_position_embeddings=16384,
|
||||
max_decoder_position_embeddings=1024,
|
||||
encoder_layers=12,
|
||||
encoder_ffn_dim=4096,
|
||||
encoder_attention_heads=16,
|
||||
decoder_layers=12,
|
||||
decoder_ffn_dim=4096,
|
||||
decoder_attention_heads=16,
|
||||
encoder_layerdrop=0.0,
|
||||
decoder_layerdrop=0.0,
|
||||
use_cache=True,
|
||||
is_encoder_decoder=True,
|
||||
activation_function="gelu",
|
||||
d_model=1024,
|
||||
dropout=0.1,
|
||||
attention_dropout=0.0,
|
||||
activation_dropout=0.0,
|
||||
init_std=0.02,
|
||||
decoder_start_token_id=2,
|
||||
classifier_dropout=0.0,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
gradient_checkpointing=False,
|
||||
attention_window: Union[List[int], int] = 512,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
pad_token_id=pad_token_id,
|
||||
bos_token_id=bos_token_id,
|
||||
eos_token_id=eos_token_id,
|
||||
is_encoder_decoder=is_encoder_decoder,
|
||||
decoder_start_token_id=decoder_start_token_id,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.max_encoder_position_embeddings = max_encoder_position_embeddings
|
||||
self.max_decoder_position_embeddings = max_decoder_position_embeddings
|
||||
self.d_model = d_model
|
||||
self.encoder_ffn_dim = encoder_ffn_dim
|
||||
self.encoder_layers = encoder_layers
|
||||
self.encoder_attention_heads = encoder_attention_heads
|
||||
self.decoder_ffn_dim = decoder_ffn_dim
|
||||
self.decoder_layers = decoder_layers
|
||||
self.decoder_attention_heads = decoder_attention_heads
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.activation_dropout = activation_dropout
|
||||
self.activation_function = activation_function
|
||||
self.init_std = init_std
|
||||
self.encoder_layerdrop = encoder_layerdrop
|
||||
self.decoder_layerdrop = decoder_layerdrop
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.use_cache = use_cache
|
||||
self.num_hidden_layers = encoder_layers
|
||||
self.attention_window = attention_window
|
||||
self.gradient_checkpointing = gradient_checkpointing
|
||||
|
||||
@property
|
||||
def num_attention_heads(self) -> int:
|
||||
return self.encoder_attention_heads
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> int:
|
||||
return self.d_model
|
||||
|
||||
@property
|
||||
def attention_probs_dropout_prob(self) -> float:
|
||||
return self.attention_dropout
|
||||
|
||||
@property
|
||||
def initializer_range(self) -> float:
|
||||
return self.init_std
|
||||
2476
src/transformers/models/led/modeling_led.py
Executable file
2476
src/transformers/models/led/modeling_led.py
Executable file
File diff suppressed because it is too large
Load Diff
2247
src/transformers/models/led/modeling_tf_led.py
Normal file
2247
src/transformers/models/led/modeling_tf_led.py
Normal file
File diff suppressed because it is too large
Load Diff
51
src/transformers/models/led/tokenization_led.py
Normal file
51
src/transformers/models/led/tokenization_led.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for LED."""
|
||||
from ...utils import logging
|
||||
from ..bart.tokenization_bart import BartTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json",
|
||||
},
|
||||
"merges_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt",
|
||||
},
|
||||
"tokenizer_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json",
|
||||
},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"allenai/led-base-16384": 16384,
|
||||
}
|
||||
|
||||
|
||||
class LEDTokenizer(BartTokenizer):
|
||||
"""
|
||||
Construct a LED tokenizer.
|
||||
|
||||
:class:`~transformers.LEDTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end
|
||||
tokenization: punctuation splitting and wordpiece.
|
||||
|
||||
Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning
|
||||
parameters.
|
||||
"""
|
||||
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
53
src/transformers/models/led/tokenization_led_fast.py
Normal file
53
src/transformers/models/led/tokenization_led_fast.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for LED."""
|
||||
from ...utils import logging
|
||||
from ..bart.tokenization_bart_fast import BartTokenizerFast
|
||||
from .tokenization_led import LEDTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/vocab.json",
|
||||
},
|
||||
"merges_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/merges.txt",
|
||||
},
|
||||
"tokenizer_file": {
|
||||
"allenai/led-base-16384": "https://huggingface.co/allenai/led-base-16384/resolve/main/tokenizer.json",
|
||||
},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"allenai/led-base-16384": 16384,
|
||||
}
|
||||
|
||||
|
||||
class LEDTokenizerFast(BartTokenizerFast):
|
||||
r"""
|
||||
Construct a "fast" LED tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
:class:`~transformers.LEDTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
|
||||
end-to-end tokenization: punctuation splitting and wordpiece.
|
||||
|
||||
Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
|
||||
parameters.
|
||||
"""
|
||||
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
slow_tokenizer_class = LEDTokenizer
|
||||
@@ -549,13 +549,19 @@ class LongformerSelfAttention(nn.Module):
|
||||
self.one_sided_attn_window_size = attention_window // 2
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
is_index_masked=None,
|
||||
is_index_global_attn=None,
|
||||
is_global_attn=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
"""
|
||||
LongformerSelfAttention expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
|
||||
`attention_window` happens in LongformerModel.forward to avoid redoing the padding on each layer.
|
||||
:class:`LongformerSelfAttention` expects `len(hidden_states)` to be multiple of `attention_window`. Padding to
|
||||
`attention_window` happens in :meth:`LongformerModel.forward` to avoid redoing the padding on each layer.
|
||||
|
||||
The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to -ve: no attention
|
||||
The `attention_mask` is changed in :meth:`BertModel.forward` from 0, 1, 2 to -ve: no attention
|
||||
|
||||
0: local attention
|
||||
+ve: global attention
|
||||
@@ -631,17 +637,17 @@ class LongformerSelfAttention(nn.Module):
|
||||
# free memory
|
||||
del global_key_attn_scores
|
||||
|
||||
local_attn_probs_fp32 = F.softmax(attn_scores, dim=-1, dtype=torch.float32) # use fp32 for numerical stability
|
||||
local_attn_probs = local_attn_probs_fp32.type_as(attn_scores)
|
||||
|
||||
# free memory
|
||||
del local_attn_probs_fp32
|
||||
attn_probs = F.softmax(attn_scores, dim=-1, dtype=torch.float32) # use fp32 for numerical stability
|
||||
|
||||
# softmax sometimes inserts NaN if all positions are masked, replace them with 0
|
||||
local_attn_probs = torch.masked_fill(local_attn_probs, is_index_masked[:, :, None, None], 0.0)
|
||||
attn_probs = torch.masked_fill(attn_probs, is_index_masked[:, :, None, None], 0.0)
|
||||
attn_probs = attn_probs.type_as(attn_scores)
|
||||
|
||||
# free memory
|
||||
del attn_scores
|
||||
|
||||
# apply dropout
|
||||
local_attn_probs = F.dropout(local_attn_probs, p=self.dropout, training=self.training)
|
||||
attn_probs = F.dropout(attn_probs, p=self.dropout, training=self.training)
|
||||
|
||||
value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1)
|
||||
|
||||
@@ -650,7 +656,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
# compute sum of global and local attn
|
||||
attn_output = self._compute_attn_output_with_global_indices(
|
||||
value_vectors=value_vectors,
|
||||
attn_probs=local_attn_probs,
|
||||
attn_probs=attn_probs,
|
||||
max_num_global_attn_indices=max_num_global_attn_indices,
|
||||
is_index_global_attn_nonzero=is_index_global_attn_nonzero,
|
||||
is_local_index_global_attn_nonzero=is_local_index_global_attn_nonzero,
|
||||
@@ -658,7 +664,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
else:
|
||||
# compute local attn only
|
||||
attn_output = self._sliding_chunks_matmul_attn_probs_value(
|
||||
local_attn_probs, value_vectors, self.one_sided_attn_window_size
|
||||
attn_probs, value_vectors, self.one_sided_attn_window_size
|
||||
)
|
||||
|
||||
assert attn_output.size() == (batch_size, seq_len, self.num_heads, self.head_dim), "Unexpected size"
|
||||
@@ -688,11 +694,14 @@ class LongformerSelfAttention(nn.Module):
|
||||
# The attention weights for tokens with global attention are
|
||||
# just filler values, they were never used to compute the output.
|
||||
# Fill with 0 now, the correct values are in 'global_attn_probs'.
|
||||
local_attn_probs[is_index_global_attn_nonzero] = 0
|
||||
attn_probs[is_index_global_attn_nonzero] = 0
|
||||
|
||||
outputs = (attn_output.transpose(0, 1), local_attn_probs)
|
||||
outputs = (attn_output.transpose(0, 1),)
|
||||
|
||||
return outputs + (global_attn_probs,) if is_global_attn else outputs
|
||||
if output_attentions:
|
||||
outputs += (attn_probs,)
|
||||
|
||||
return outputs + (global_attn_probs,) if (is_global_attn and output_attentions) else outputs
|
||||
|
||||
@staticmethod
|
||||
def _pad_and_transpose_last_two_dims(hidden_states_padded, padding):
|
||||
@@ -711,6 +720,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
shift every row 1 step right, converting columns into diagonals.
|
||||
|
||||
Example::
|
||||
|
||||
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
|
||||
-1.8348, 0.7672, 0.2986, 0.0285,
|
||||
-0.7584, 0.4206, -0.0405, 0.1599,
|
||||
@@ -728,13 +738,13 @@ class LongformerSelfAttention(nn.Module):
|
||||
) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
|
||||
chunked_hidden_states = chunked_hidden_states.view(
|
||||
total_num_heads, num_chunks, -1
|
||||
) # total_num_heads x num_chunks x window_overlapL+window_overlapwindow_overlap+window_overlap
|
||||
) # total_num_heads x num_chunks x window_overlap*window_overlap+window_overlap
|
||||
chunked_hidden_states = chunked_hidden_states[
|
||||
:, :, :-window_overlap
|
||||
] # total_num_heads x num_chunks x window_overlapL+window_overlapwindow_overlap
|
||||
] # total_num_heads x num_chunks x window_overlap*window_overlap
|
||||
chunked_hidden_states = chunked_hidden_states.view(
|
||||
total_num_heads, num_chunks, window_overlap, window_overlap + hidden_dim
|
||||
) # total_num_heads x num_chunks, window_overlap x hidden_dim+window_overlap
|
||||
)
|
||||
chunked_hidden_states = chunked_hidden_states[:, :, :, :-1]
|
||||
return chunked_hidden_states
|
||||
|
||||
@@ -788,18 +798,18 @@ class LongformerSelfAttention(nn.Module):
|
||||
query = query.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
|
||||
key = key.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim)
|
||||
|
||||
chunked_query = self._chunk(query, window_overlap)
|
||||
chunked_key = self._chunk(key, window_overlap)
|
||||
query = self._chunk(query, window_overlap)
|
||||
key = self._chunk(key, window_overlap)
|
||||
|
||||
# matrix multiplication
|
||||
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
|
||||
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
|
||||
# bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap
|
||||
chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (chunked_query, chunked_key)) # multiply
|
||||
diagonal_chunked_attention_scores = torch.einsum("bcxd,bcyd->bcxy", (query, key)) # multiply
|
||||
|
||||
# convert diagonals into columns
|
||||
diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(
|
||||
chunked_attention_scores, padding=(0, 0, 0, 1)
|
||||
diagonal_chunked_attention_scores, padding=(0, 0, 0, 1)
|
||||
)
|
||||
|
||||
# allocate space for the overall attention matrix where the chunks are combined. The last dimension
|
||||
@@ -1095,7 +1105,13 @@ class LongformerAttention(nn.Module):
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
is_index_masked=None,
|
||||
is_index_global_attn=None,
|
||||
is_global_attn=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_outputs = self.self(
|
||||
hidden_states,
|
||||
@@ -1103,6 +1119,7 @@ class LongformerAttention(nn.Module):
|
||||
is_index_masked=is_index_masked,
|
||||
is_index_global_attn=is_index_global_attn,
|
||||
is_global_attn=is_global_attn,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
attn_output = self.output(self_outputs[0], hidden_states)
|
||||
outputs = (attn_output,) + self_outputs[1:]
|
||||
@@ -1150,7 +1167,13 @@ class LongformerLayer(nn.Module):
|
||||
self.seq_len_dim = 1
|
||||
|
||||
def forward(
|
||||
self, hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
is_index_masked=None,
|
||||
is_index_global_attn=None,
|
||||
is_global_attn=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_attn_outputs = self.attention(
|
||||
hidden_states,
|
||||
@@ -1158,6 +1181,7 @@ class LongformerLayer(nn.Module):
|
||||
is_index_masked=is_index_masked,
|
||||
is_index_global_attn=is_index_global_attn,
|
||||
is_global_attn=is_global_attn,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
attn_output = self_attn_outputs[0]
|
||||
outputs = self_attn_outputs[1:]
|
||||
@@ -1205,7 +1229,7 @@ class LongformerEncoder(nn.Module):
|
||||
|
||||
def create_custom_forward(module):
|
||||
def custom_forward(*inputs):
|
||||
return module(*inputs, is_global_attn)
|
||||
return module(*inputs, is_global_attn, output_attentions)
|
||||
|
||||
return custom_forward
|
||||
|
||||
@@ -1223,6 +1247,7 @@ class LongformerEncoder(nn.Module):
|
||||
is_index_masked=is_index_masked,
|
||||
is_index_global_attn=is_index_global_attn,
|
||||
is_global_attn=is_global_attn,
|
||||
output_attentions=output_attentions,
|
||||
)
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
|
||||
@@ -796,7 +796,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
)
|
||||
|
||||
# normalize query
|
||||
query_vectors /= tf.math.sqrt(tf.constant(self.head_dim, dtype=tf.dtypes.float32))
|
||||
query_vectors /= tf.math.sqrt(tf.convert_to_tensor(self.head_dim, dtype=tf.dtypes.float32))
|
||||
query_vectors = tf.reshape(query_vectors, (batch_size, seq_len, self.num_heads, self.head_dim))
|
||||
key_vectors = tf.reshape(key_vectors, (batch_size, seq_len, self.num_heads, self.head_dim))
|
||||
|
||||
@@ -945,7 +945,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
chunked_attention_scores = tf.einsum("bcxd,bcyd->bcxy", chunked_query, chunked_key) # multiply
|
||||
|
||||
# convert diagonals into columns
|
||||
paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32)
|
||||
paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32)
|
||||
diagonal_chunked_attention_scores = self._pad_and_transpose_last_two_dims(chunked_attention_scores, paddings)
|
||||
|
||||
# allocate space for the overall attention matrix where the chunks are combined. The last dimension
|
||||
@@ -1093,7 +1093,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
)
|
||||
|
||||
# pad seq_len with w at the beginning of the sequence and another window overlap at the end
|
||||
paddings = tf.constant([[0, 0], [window_overlap, window_overlap], [0, 0]], dtype=tf.dtypes.int32)
|
||||
paddings = tf.convert_to_tensor([[0, 0], [window_overlap, window_overlap], [0, 0]], dtype=tf.dtypes.int32)
|
||||
padded_value = tf.pad(value, paddings, constant_values=-1)
|
||||
|
||||
# chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap
|
||||
@@ -1141,6 +1141,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
shift every row 1 step right, converting columns into diagonals.
|
||||
|
||||
Example::
|
||||
|
||||
chunked_hidden_states: [ 0.4983, 2.6918, -0.0071, 1.0492,
|
||||
-1.8348, 0.7672, 0.2986, 0.0285,
|
||||
-0.7584, 0.4206, -0.0405, 0.1599,
|
||||
@@ -1153,7 +1154,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
"""
|
||||
total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
|
||||
paddings = tf.constant([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
|
||||
paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
|
||||
chunked_hidden_states = tf.pad(
|
||||
chunked_hidden_states, paddings
|
||||
) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). Padding value is not important because it'll be overwritten
|
||||
@@ -1349,7 +1350,7 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
|
||||
global_value_vectors = self.value_global(hidden_states)
|
||||
|
||||
# normalize
|
||||
global_query_vectors_only_global /= tf.math.sqrt(tf.constant(self.head_dim, dtype=tf.dtypes.float32))
|
||||
global_query_vectors_only_global /= tf.math.sqrt(tf.convert_to_tensor(self.head_dim, dtype=tf.dtypes.float32))
|
||||
global_query_vectors_only_global = self.reshape_and_transpose(global_query_vectors_only_global, batch_size)
|
||||
global_key_vectors = self.reshape_and_transpose(global_key_vectors, batch_size)
|
||||
global_value_vectors = self.reshape_and_transpose(global_value_vectors, batch_size)
|
||||
@@ -1820,10 +1821,10 @@ class TFLongformerPreTrainedModel(TFPreTrainedModel):
|
||||
|
||||
@property
|
||||
def dummy_inputs(self):
|
||||
input_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
|
||||
input_ids = tf.convert_to_tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
|
||||
# make sure global layers are initialized
|
||||
attention_mask = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
||||
global_attention_mask = tf.constant([[0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1]])
|
||||
attention_mask = tf.convert_to_tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
|
||||
global_attention_mask = tf.convert_to_tensor([[0, 0, 0, 0, 1], [0, 0, 1, 0, 0], [0, 0, 0, 0, 1]])
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
@@ -2371,9 +2372,9 @@ class TFLongformerForMultipleChoice(TFLongformerPreTrainedModel, TFMultipleChoic
|
||||
|
||||
@property
|
||||
def dummy_inputs(self):
|
||||
input_ids = tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)
|
||||
input_ids = tf.convert_to_tensor(MULTIPLE_CHOICE_DUMMY_INPUTS)
|
||||
# make sure global layers are initialized
|
||||
global_attention_mask = tf.constant([[[0, 0, 0, 1], [0, 0, 0, 1]]] * 2)
|
||||
global_attention_mask = tf.convert_to_tensor([[[0, 0, 0, 1], [0, 0, 0, 1]]] * 2)
|
||||
return {"input_ids": input_ids, "global_attention_mask": global_attention_mask}
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
|
||||
@@ -1179,6 +1179,45 @@ class LayoutLMModel:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
LED_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class LEDForConditionalGeneration:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class LEDForQuestionAnswering:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class LEDForSequenceClassification:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
class LEDModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
||||
@@ -827,6 +827,33 @@ class TFGPT2PreTrainedModel:
|
||||
requires_tf(self)
|
||||
|
||||
|
||||
class TFLEDForConditionalGeneration:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
|
||||
class TFLEDModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
|
||||
class TFLEDPreTrainedModel:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_tf(self)
|
||||
|
||||
|
||||
TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
||||
@@ -128,6 +128,15 @@ class LayoutLMTokenizerFast:
|
||||
requires_tokenizers(self)
|
||||
|
||||
|
||||
class LEDTokenizerFast:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_tokenizers(self)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, *args, **kwargs):
|
||||
requires_tokenizers(self)
|
||||
|
||||
|
||||
class LongformerTokenizerFast:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_tokenizers(self)
|
||||
|
||||
Reference in New Issue
Block a user