Funnel transformer (#6908)

* Initial model

* Fix upsampling

* Add special cls token id and test

* Formatting

* Test and fist FunnelTokenizerFast

* Common tests

* Fix the check_repo script and document Funnel

* Doc fixes

* Add all models

* Write doc

* Fix test

* Initial model

* Fix upsampling

* Add special cls token id and test

* Formatting

* Test and fist FunnelTokenizerFast

* Common tests

* Fix the check_repo script and document Funnel

* Doc fixes

* Add all models

* Write doc

* Fix test

* Fix copyright

* Forgot some layers can be repeated

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/modeling_funnel.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comments

* Update src/transformers/modeling_funnel.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Address review comments

* Update src/transformers/modeling_funnel.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Slow integration test

* Make small integration test

* Formatting

* Add checkpoint and separate classification head

* Formatting

* Expand list, fix link and add in pretrained models

* Styling

* Add the model in all summaries

* Typo fixes

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
This commit is contained in:
Sylvain Gugger
2020-09-08 08:08:08 -04:00
committed by GitHub
parent 25afb4ea50
commit d155b38d6e
18 changed files with 3208 additions and 405 deletions

View File

@@ -29,6 +29,7 @@ from .configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
from .configuration_encoder_decoder import EncoderDecoderConfig
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
@@ -155,6 +156,7 @@ from .tokenization_dpr import (
)
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
@@ -327,6 +329,18 @@ if is_torch_available():
FlaubertModel,
FlaubertWithLMHeadModel,
)
from .modeling_funnel import (
FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
FunnelBaseModel,
FunnelForMaskedLM,
FunnelForMultipleChoice,
FunnelForPreTraining,
FunnelForQuestionAnswering,
FunnelForSequenceClassification,
FunnelForTokenClassification,
FunnelModel,
load_tf_weights_in_funnel,
)
from .modeling_gpt2 import (
GPT2_PRETRAINED_MODEL_ARCHIVE_LIST,
GPT2DoubleHeadsModel,

View File

@@ -15,6 +15,12 @@ def convert_command_factory(args: Namespace):
)
IMPORT_ERROR_MESSAGE = """transformers can only be used from the commandline to convert TensorFlow models in PyTorch,
In that case, it requires TensorFlow to be installed. Please see
https://www.tensorflow.org/install/ for installation instructions.
"""
class ConvertCommand(BaseTransformersCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
@@ -69,12 +75,7 @@ class ConvertCommand(BaseTransformersCLICommand):
convert_tf_checkpoint_to_pytorch,
)
except ImportError:
msg = (
"transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise ImportError(msg)
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "bert":
@@ -83,12 +84,16 @@ class ConvertCommand(BaseTransformersCLICommand):
convert_tf_checkpoint_to_pytorch,
)
except ImportError:
msg = (
"transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "funnel":
try:
from transformers.convert_funnel_original_tf_checkpoint_to_pytorch import (
convert_tf_checkpoint_to_pytorch,
)
raise ImportError(msg)
except ImportError:
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "gpt":
@@ -103,12 +108,7 @@ class ConvertCommand(BaseTransformersCLICommand):
convert_transfo_xl_checkpoint_to_pytorch,
)
except ImportError:
msg = (
"transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise ImportError(msg)
raise ImportError(IMPORT_ERROR_MESSAGE)
if "ckpt" in self._tf_checkpoint.lower():
TF_CHECKPOINT = self._tf_checkpoint
@@ -125,12 +125,7 @@ class ConvertCommand(BaseTransformersCLICommand):
convert_gpt2_checkpoint_to_pytorch,
)
except ImportError:
msg = (
"transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise ImportError(msg)
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
elif self._model_type == "xlnet":
@@ -139,12 +134,7 @@ class ConvertCommand(BaseTransformersCLICommand):
convert_xlnet_checkpoint_to_pytorch,
)
except ImportError:
msg = (
"transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
"In that case, it requires TensorFlow to be installed. Please see "
"https://www.tensorflow.org/install/ for installation instructions."
)
raise ImportError(msg)
raise ImportError(IMPORT_ERROR_MESSAGE)
convert_xlnet_checkpoint_to_pytorch(
self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name

View File

@@ -26,6 +26,7 @@ from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
from .configuration_encoder_decoder import EncoderDecoderConfig
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
from .configuration_funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
from .configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
from .configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
@@ -67,6 +68,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP,
LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
]
for key, value, in pretrained_map.items()
@@ -168,6 +170,10 @@ CONFIG_MAPPING = OrderedDict(
"encoder-decoder",
EncoderDecoderConfig,
),
(
"funnel",
FunnelConfig,
),
(
"lxmert",
LxmertConfig,
@@ -230,6 +236,7 @@ class AutoConfig:
- `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
- `flaubert` : :class:`~transformers.FlaubertConfig` (Flaubert model)
- `electra` : :class:`~transformers.ElectraConfig` (ELECTRA model)
- `funnel`: :class:`~transformers.FunnelConfig` (Funnel Transformer model)
Args:
pretrained_model_name_or_path (:obj:`string`):

View File

@@ -0,0 +1,183 @@
# coding=utf-8
# Copyright 2020, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Funnel Transformer model configuration """
from .configuration_utils import PretrainedConfig
from .utils import logging
logger = logging.get_logger(__name__)
FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/config.json",
"funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/config.json",
"funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/config.json",
"funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/config.json",
"funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/config.json",
"funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/config.json",
"funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/config.json",
"funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/config.json",
"funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/config.json",
"funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/config.json",
}
class FunnelConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.FunnelModel`.
It is used to instantiate an Funnel Transformer model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the Funnel Transformer `funnel-transformer/small <https://huggingface.co/funnel-transformer/small>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the Funnel transformer. Defines the different tokens that
can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.FunnelModel`.
block_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[4, 4, 4]`):
The sizes of the blocks used in the model.
block_repeats (:obj:`List[int]`, `optional`):
If passed along, each layer of each block is repeated the number of times indicated.
num_decoder_layers (:obj:`int`, `optional`, defaults to 2):
The number of layers in the decoder (when not using the base model).
d_model (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the model's hidden states.
n_head (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
d_head (:obj:`int`, `optional`, defaults to 64):
Dimensionality of the model's heads.
d_inner (:obj:`int`, `optional`, defaults to 3072):
Inner dimension in the feed-forward blocks.
hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
The non-linear activation function (function or string) in the encoder and pooler.
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for the attention probabilities.
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
The dropout probability used between the two layers of the feed-forward blocks.
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, `optional`, defaults to 3):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.FunnelModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.1):
The standard deviation of the `uniform initializer` for initializing all weight matrices in attention
layers.
initializer_std (:obj:`float`, `optional`):
The standard deviation of the `normal initializer` for initializing the embedding matrix and the weight of
linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
linear layers.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9):
The epsilon used by the layer normalization layers.
pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each
block.
attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`):
Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while
the latter is faster on TPU.
separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to separate the cls token when applying pooling.
truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`False`):
When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting
a sequence length that is not a multiple of 2.
pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to apply the pooling only to the query or to query, key and values for the attention
layers.
"""
model_type = "funnel"
def __init__(
self,
vocab_size=30522,
block_sizes=[4, 4, 4],
block_repeats=None,
num_decoder_layers=2,
d_model=768,
n_head=12,
d_head=64,
d_inner=3072,
hidden_act="gelu_new",
hidden_dropout=0.1,
attention_dropout=0.1,
activation_dropout=0.0,
max_position_embeddings=512,
type_vocab_size=3,
initializer_range=0.1,
initializer_std=None,
layer_norm_eps=1e-9,
pooling_type="mean",
attention_type="relative_shift",
separate_cls=True,
truncate_seq=True,
pool_q_only=True,
**kwargs
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.block_sizes = block_sizes
self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats
assert len(block_sizes) == len(
self.block_repeats
), "`block_sizes` and `block_repeats` should have the same length."
self.num_decoder_layers = num_decoder_layers
self.d_model = d_model
self.n_head = n_head
self.d_head = d_head
self.d_inner = d_inner
self.hidden_act = hidden_act
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.initializer_std = initializer_std
self.layer_norm_eps = layer_norm_eps
assert pooling_type in [
"mean",
"max",
], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported."
self.pooling_type = pooling_type
assert attention_type in [
"relative_shift",
"factorized",
], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported."
self.attention_type = attention_type
self.separate_cls = separate_cls
self.truncate_seq = truncate_seq
self.pool_q_only = pool_q_only
@property
def hidden_size(self):
return self.d_model
@property
def num_attention_heads(self):
return self.n_head
@property
def num_hidden_layers(self):
return sum(self.block_sizes)
@property
def num_blocks(self):
return len(self.block_sizes)

View File

@@ -0,0 +1,61 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Funnel checkpoint."""
import argparse
import logging
import torch
from transformers import FunnelConfig, FunnelForPreTraining, load_tf_weights_in_funnel
logging.basicConfig(level=logging.INFO)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
# Initialise PyTorch model
config = FunnelConfig.from_json_file(config_file)
print("Building PyTorch model from configuration: {}".format(str(config)))
model = FunnelForPreTraining(config)
# Load weights from tf checkpoint
load_tf_weights_in_funnel(model, config, tf_checkpoint_path)
# Save pytorch-model
print("Save PyTorch model to {}".format(pytorch_dump_path))
torch.save(model.state_dict(), pytorch_dump_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
)
parser.add_argument(
"--config_file",
default=None,
type=str,
required=True,
help="The config json file corresponding to the pre-trained model. \n"
"This specifies the model architecture.",
)
parser.add_argument(
"--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
)
args = parser.parse_args()
convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path)

View File

@@ -29,6 +29,7 @@ from .configuration_auto import (
ElectraConfig,
EncoderDecoderConfig,
FlaubertConfig,
FunnelConfig,
GPT2Config,
LongformerConfig,
LxmertConfig,
@@ -108,6 +109,14 @@ from .modeling_flaubert import (
FlaubertModel,
FlaubertWithLMHeadModel,
)
from .modeling_funnel import (
FunnelForMaskedLM,
FunnelForMultipleChoice,
FunnelForQuestionAnswering,
FunnelForSequenceClassification,
FunnelForTokenClassification,
FunnelModel,
)
from .modeling_gpt2 import GPT2LMHeadModel, GPT2Model
from .modeling_longformer import (
LongformerForMaskedLM,
@@ -202,6 +211,7 @@ MODEL_MAPPING = OrderedDict(
(CTRLConfig, CTRLModel),
(ElectraConfig, ElectraModel),
(ReformerConfig, ReformerModel),
(FunnelConfig, FunnelModel),
(LxmertConfig, LxmertModel),
]
)
@@ -254,6 +264,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
(ElectraConfig, ElectraForMaskedLM),
(EncoderDecoderConfig, EncoderDecoderModel),
(ReformerConfig, ReformerModelWithLMHead),
(FunnelConfig, FunnelForMaskedLM),
]
)
@@ -291,6 +302,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
(XLMConfig, XLMWithLMHeadModel),
(ElectraConfig, ElectraForMaskedLM),
(ReformerConfig, ReformerForMaskedLM),
(FunnelConfig, FunnelForMaskedLM),
]
)
@@ -320,6 +332,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
(FlaubertConfig, FlaubertForSequenceClassification),
(XLMConfig, XLMForSequenceClassification),
(ElectraConfig, ElectraForSequenceClassification),
(FunnelConfig, FunnelForSequenceClassification),
]
)
@@ -339,6 +352,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
(XLMConfig, XLMForQuestionAnsweringSimple),
(ElectraConfig, ElectraForQuestionAnswering),
(ReformerConfig, ReformerForQuestionAnswering),
(FunnelConfig, FunnelForQuestionAnswering),
]
)
@@ -357,6 +371,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
(AlbertConfig, AlbertForTokenClassification),
(ElectraConfig, ElectraForTokenClassification),
(FlaubertConfig, FlaubertForTokenClassification),
(FunnelConfig, FunnelForTokenClassification),
]
)
@@ -374,6 +389,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
(AlbertConfig, AlbertForMultipleChoice),
(XLMConfig, XLMForMultipleChoice),
(FlaubertConfig, FlaubertForMultipleChoice),
(FunnelConfig, FunnelForMultipleChoice),
]
)
@@ -421,6 +437,7 @@ class AutoModel:
- isInstance of `xlm` configuration class: :class:`~transformers.XLMModel` (XLM model)
- isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertModel` (Flaubert model)
- isInstance of `electra` configuration class: :class:`~transformers.ElectraModel` (Electra model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelModel` (Funnel Transformer model)
Examples::
@@ -462,6 +479,7 @@ class AutoModel:
- `ctrl`: :class:`~transformers.CTRLModel` (Salesforce CTRL model)
- `flaubert`: :class:`~transformers.FlaubertModel` (Flaubert model)
- `electra`: :class:`~transformers.ElectraModel` (Electra model)
- `funnel`: :class:`~transformers.FunnelModel` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
@@ -729,6 +747,7 @@ class AutoModelWithLMHead:
- isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model)
- isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
- isInstance of `electra` configuration class: :class:`~transformers.ElectraForMaskedLM` (Electra model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)
Examples::
@@ -774,6 +793,7 @@ class AutoModelWithLMHead:
- `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model)
- `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
- `electra`: :class:`~transformers.ElectraForMaskedLM` (Electra model)
- `funnel`: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
@@ -1024,6 +1044,7 @@ class AutoModelForMaskedLM:
- isInstance of `electra` configuration class: :class:`~transformers.ElectraForMaskedLM` (Electra model)
- isInstance of `camembert` configuration class: :class:`~transformers.CamembertForMaskedLM` (Camembert model)
- isInstance of `albert` configuration class: :class:`~transformers.AlbertForMaskedLM` (Albert model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)
Examples::
@@ -1060,6 +1081,7 @@ class AutoModelForMaskedLM:
- `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model)
- `electra`: :class:`~transformers.ElectraForMaskedLM` (Electra model)
- `bert`: :class:`~transformers.BertLMHeadModel` (Bert model)
- `funnel`: :class:`~transformers.FunnelForMaskedLM` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
@@ -1304,7 +1326,7 @@ class AutoModelForSequenceClassification:
- isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForSequenceClassification` (XLNet model)
- isInstance of `xlm` configuration class: :class:`~transformers.XLMForSequenceClassification` (XLM model)
- isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelModelForSequenceClassification` (Funnel Transformer model)
Examples::
@@ -1340,6 +1362,7 @@ class AutoModelForSequenceClassification:
- `bert`: :class:`~transformers.BertForSequenceClassification` (Bert model)
- `xlnet`: :class:`~transformers.XLNetForSequenceClassification` (XLNet model)
- `flaubert`: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model)
- `funnel`: :class:`~transformers.FunnelForSequenceClassification` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
@@ -1454,6 +1477,7 @@ class AutoModelForQuestionAnswering:
- isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForQuestionAnswering` (XLNet model)
- isInstance of `xlm` configuration class: :class:`~transformers.XLMForQuestionAnswering` (XLM model)
- isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelForQuestionAnswering` (Funnel Transformer model)
Examples::
@@ -1488,6 +1512,7 @@ class AutoModelForQuestionAnswering:
- `xlnet`: :class:`~transformers.XLNetForQuestionAnswering` (XLNet model)
- `xlm`: :class:`~transformers.XLMForQuestionAnswering` (XLM model)
- `flaubert`: :class:`~transformers.FlaubertForQuestionAnswering` (XLM model)
- `funnel`: :class:`~transformers.FunnelForQuestionAnswering` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
@@ -1604,6 +1629,7 @@ class AutoModelForTokenClassification:
- isInstance of `camembert` configuration class: :class:`~transformers.CamembertModelForTokenClassification` (Camembert model)
- isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForTokenClassification` (Roberta model)
- isInstance of `electra` configuration class: :class:`~transformers.ElectraForTokenClassification` (Electra model)
- isInstance of `funnel` configuration class: :class:`~transformers.FunnelForTokenClassification` (Funnel Transformer model)
Examples::
@@ -1641,6 +1667,7 @@ class AutoModelForTokenClassification:
- `flaubert`: :class:`~transformers.FlaubertForTokenClassification` (Flaubert model)
- `roberta`: :class:`~transformers.RobertaForTokenClassification` (Roberta model)
- `electra`: :class:`~transformers.ElectraForTokenClassification` (Electra model)
- `funnel`: :class:`~transformers.FunnelForTokenClassification` (Funnel Transformer model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`

File diff suppressed because it is too large Load Diff

View File

@@ -27,6 +27,7 @@ from .configuration_auto import (
DistilBertConfig,
ElectraConfig,
FlaubertConfig,
FunnelConfig,
GPT2Config,
LongformerConfig,
LxmertConfig,
@@ -54,6 +55,7 @@ from .tokenization_ctrl import CTRLTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
@@ -93,6 +95,7 @@ TOKENIZER_MAPPING = OrderedDict(
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
(ReformerConfig, (ReformerTokenizer, None)),
(ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
(FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
(LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
(BertConfig, (BertTokenizer, BertTokenizerFast)),
(OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
@@ -131,6 +134,7 @@ class AutoTokenizer:
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
- `funnel`: FunnelTokenizer (Funnel Transformer model)
- `lxmert`: LxmertTokenizer (Lxmert model)
This class cannot be instantiated using `__init__()` (throw an error).
@@ -167,6 +171,7 @@ class AutoTokenizer:
- `xlm`: XLMTokenizer (XLM model)
- `ctrl`: CTRLTokenizer (Salesforce CTRL model)
- `electra`: ElectraTokenizer (Google ELECTRA model)
- `funnel`: FunnelTokenizer (Funnel Transformer model)
- `lxmert`: LxmertTokenizer (Lxmert model)
Params:

View File

@@ -0,0 +1,232 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for Funnel Transformer."""
from typing import List, Optional
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
_model_names = [
"small",
"small-base",
"medium",
"medium-base",
"intermediate",
"intermediate-base",
"large",
"large-base",
"xlarge",
"xlarge-base",
]
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/vocab.txt",
"funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/vocab.txt",
"funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/vocab.txt",
"funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/vocab.txt",
"funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/vocab.txt",
"funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/vocab.txt",
"funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/vocab.txt",
"funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/vocab.txt",
"funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/vocab.txt",
"funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {f"funnel-transformer/{name}": 512 for name in _model_names}
PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names}
class FunnelTokenizer(BertTokenizer):
r"""
Tokenizer for the Funnel Transformer models.
:class:`~transformers.FunnelTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
cls_token_type_id: int = 2
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
bos_token="<s>",
eos_token="</s>",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs
):
super().__init__(
vocab_file,
do_lower_case=do_lower_case,
do_basic_tokenize=do_basic_tokenize,
never_split=never_split,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
bos_token=bos_token,
eos_token=eos_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Funnel Transformer expects a sequence pair mask that has the following format:
::
2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
class FunnelTokenizerFast(BertTokenizerFast):
r"""
"Fast" tokenizer for the Funnel Transformer models (backed by HuggingFace's :obj:`tokenizers` library).
:class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
cls_token_type_id: int = 2
def __init__(
self,
vocab_file,
do_lower_case=True,
unk_token="<unk>",
sep_token="<sep>",
pad_token="<pad>",
cls_token="<cls>",
mask_token="<mask>",
bos_token="<s>",
eos_token="</s>",
clean_text=True,
tokenize_chinese_chars=True,
strip_accents=None,
wordpieces_prefix="##",
**kwargs
):
super().__init__(
vocab_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
bos_token=bos_token,
eos_token=eos_token,
clean_text=clean_text,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
wordpieces_prefix=wordpieces_prefix,
**kwargs,
)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Funnel Transformer expects a sequence pair mask that has the following format:
::
2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def _convert_encoding(self, encoding, **kwargs):
# The fast tokenizer doesn't use the function above so we fix the cls token type id when decoding the fast
# tokenzier output.
encoding_dict = super()._convert_encoding(encoding, **kwargs)
if "token_type_ids" in encoding_dict:
# Note: we can't assume the <cls> token is in first position because left padding is a thing, hence the
# double list comprehension.
encoding_dict["token_type_ids"] = [
[self.cls_token_type_id if i == self.cls_token_id else t for i, t in zip(input_ids, type_ids)]
for input_ids, type_ids in zip(encoding_dict["input_ids"], encoding_dict["token_type_ids"])
]
return encoding_dict