Reorganize repo (#8580)
* Put models in subfolders * Styling * Fix imports in tests * More fixes in test imports * Sneaky hidden imports * Fix imports in doc files * More sneaky imports * Finish fixing tests * Fix examples * Fix path for copies * More fixes for examples * Fix dummy files * More fixes for example * More model import fixes * Is this why you're unhappy GitHub? * Fix imports in conver command
This commit is contained in:
39
src/transformers/models/roberta/__init__.py
Normal file
39
src/transformers/models/roberta/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
from ...file_utils import is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
from .tokenization_roberta_fast import RobertaTokenizerFast
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_roberta import (
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
RobertaForCausalLM,
|
||||
RobertaForMaskedLM,
|
||||
RobertaForMultipleChoice,
|
||||
RobertaForQuestionAnswering,
|
||||
RobertaForSequenceClassification,
|
||||
RobertaForTokenClassification,
|
||||
RobertaModel,
|
||||
)
|
||||
|
||||
if is_tf_available():
|
||||
from .modeling_tf_roberta import (
|
||||
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
TFRobertaForMaskedLM,
|
||||
TFRobertaForMultipleChoice,
|
||||
TFRobertaForQuestionAnswering,
|
||||
TFRobertaForSequenceClassification,
|
||||
TFRobertaForTokenClassification,
|
||||
TFRobertaMainLayer,
|
||||
TFRobertaModel,
|
||||
TFRobertaPreTrainedModel,
|
||||
)
|
||||
|
||||
if is_flax_available():
|
||||
from .modeling_flax_roberta import FlaxRobertaModel
|
||||
64
src/transformers/models/roberta/configuration_roberta.py
Normal file
64
src/transformers/models/roberta/configuration_roberta.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" RoBERTa configuration """
|
||||
|
||||
from ...utils import logging
|
||||
from ..bert.configuration_bert import BertConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
class RobertaConfig(BertConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
|
||||
:class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
|
||||
arguments, defining the model architecture.
|
||||
|
||||
|
||||
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
||||
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
||||
|
||||
The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
|
||||
same defaults. Please check the parent class for more information.
|
||||
|
||||
Examples::
|
||||
|
||||
>>> from transformers import RobertaConfig, RobertaModel
|
||||
|
||||
>>> # Initializing a RoBERTa configuration
|
||||
>>> configuration = RobertaConfig()
|
||||
|
||||
>>> # Initializing a model from the configuration
|
||||
>>> model = RobertaModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
"""
|
||||
model_type = "roberta"
|
||||
|
||||
def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
|
||||
"""Constructs RobertaConfig."""
|
||||
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
|
||||
@@ -0,0 +1,182 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert RoBERTa checkpoint."""
|
||||
|
||||
|
||||
import argparse
|
||||
import pathlib
|
||||
|
||||
import fairseq
|
||||
import torch
|
||||
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
|
||||
from fairseq.modules import TransformerSentenceEncoderLayer
|
||||
from packaging import version
|
||||
|
||||
from transformers.models.bertmodeling_bert import (
|
||||
BertIntermediate,
|
||||
BertLayer,
|
||||
BertOutput,
|
||||
BertSelfAttention,
|
||||
BertSelfOutput,
|
||||
)
|
||||
from transformers.models.roberta.modeling_roberta import (
|
||||
RobertaConfig,
|
||||
RobertaForMaskedLM,
|
||||
RobertaForSequenceClassification,
|
||||
)
|
||||
from transformers.utils import logging
|
||||
|
||||
|
||||
if version.parse(fairseq.__version__) < version.parse("0.9.0"):
|
||||
raise Exception("requires fairseq >= 0.9.0")
|
||||
|
||||
|
||||
logging.set_verbosity_info()
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
SAMPLE_TEXT = "Hello world! cécé herlolip"
|
||||
|
||||
|
||||
def convert_roberta_checkpoint_to_pytorch(
|
||||
roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
|
||||
):
|
||||
"""
|
||||
Copy/paste/tweak roberta's weights to our BERT structure.
|
||||
"""
|
||||
roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
|
||||
roberta.eval() # disable dropout
|
||||
roberta_sent_encoder = roberta.model.encoder.sentence_encoder
|
||||
config = RobertaConfig(
|
||||
vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
|
||||
hidden_size=roberta.args.encoder_embed_dim,
|
||||
num_hidden_layers=roberta.args.encoder_layers,
|
||||
num_attention_heads=roberta.args.encoder_attention_heads,
|
||||
intermediate_size=roberta.args.encoder_ffn_embed_dim,
|
||||
max_position_embeddings=514,
|
||||
type_vocab_size=1,
|
||||
layer_norm_eps=1e-5, # PyTorch default used in fairseq
|
||||
)
|
||||
if classification_head:
|
||||
config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
|
||||
print("Our BERT config:", config)
|
||||
|
||||
model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
|
||||
model.eval()
|
||||
|
||||
# Now let's copy all the weights.
|
||||
# Embeddings
|
||||
model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
|
||||
model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
|
||||
model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
|
||||
model.roberta.embeddings.token_type_embeddings.weight
|
||||
) # just zero them out b/c RoBERTa doesn't use them.
|
||||
model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
|
||||
model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
|
||||
|
||||
for i in range(config.num_hidden_layers):
|
||||
# Encoder: start of layer
|
||||
layer: BertLayer = model.roberta.encoder.layer[i]
|
||||
roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
|
||||
|
||||
# self attention
|
||||
self_attn: BertSelfAttention = layer.attention.self
|
||||
assert (
|
||||
roberta_layer.self_attn.k_proj.weight.data.shape
|
||||
== roberta_layer.self_attn.q_proj.weight.data.shape
|
||||
== roberta_layer.self_attn.v_proj.weight.data.shape
|
||||
== torch.Size((config.hidden_size, config.hidden_size))
|
||||
)
|
||||
|
||||
self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
|
||||
self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
|
||||
self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
|
||||
self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
|
||||
self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
|
||||
self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
|
||||
|
||||
# self-attention output
|
||||
self_output: BertSelfOutput = layer.attention.output
|
||||
assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
|
||||
self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
|
||||
self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
|
||||
self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
|
||||
self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
|
||||
|
||||
# intermediate
|
||||
intermediate: BertIntermediate = layer.intermediate
|
||||
assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
|
||||
intermediate.dense.weight = roberta_layer.fc1.weight
|
||||
intermediate.dense.bias = roberta_layer.fc1.bias
|
||||
|
||||
# output
|
||||
bert_output: BertOutput = layer.output
|
||||
assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
|
||||
bert_output.dense.weight = roberta_layer.fc2.weight
|
||||
bert_output.dense.bias = roberta_layer.fc2.bias
|
||||
bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
|
||||
bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
|
||||
# end of layer
|
||||
|
||||
if classification_head:
|
||||
model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
|
||||
model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
|
||||
model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
|
||||
model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
|
||||
else:
|
||||
# LM Head
|
||||
model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
|
||||
model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
|
||||
model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
|
||||
model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
|
||||
model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
|
||||
model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias
|
||||
|
||||
# Let's check that we get the same results.
|
||||
input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
|
||||
|
||||
our_output = model(input_ids)[0]
|
||||
if classification_head:
|
||||
their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
|
||||
else:
|
||||
their_output = roberta.model(input_ids)[0]
|
||||
print(our_output.shape, their_output.shape)
|
||||
max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
|
||||
print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
|
||||
success = torch.allclose(our_output, their_output, atol=1e-3)
|
||||
print("Do both models output the same tensors?", "🔥" if success else "💩")
|
||||
if not success:
|
||||
raise Exception("Something went wRoNg")
|
||||
|
||||
pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
|
||||
print(f"Saving model to {pytorch_dump_folder_path}")
|
||||
model.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--classification_head", action="store_true", help="Whether to convert a final classification head."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
convert_roberta_checkpoint_to_pytorch(
|
||||
args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
|
||||
)
|
||||
432
src/transformers/models/roberta/modeling_flax_roberta.py
Normal file
432
src/transformers/models/roberta/modeling_flax_roberta.py
Normal file
@@ -0,0 +1,432 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Callable, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
import flax.linen as nn
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
|
||||
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
|
||||
from ...modeling_flax_utils import FlaxPreTrainedModel, gelu
|
||||
from ...utils import logging
|
||||
from .configuration_roberta import RobertaConfig
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = "RobertaConfig"
|
||||
_TOKENIZER_FOR_DOC = "RobertaTokenizer"
|
||||
|
||||
|
||||
ROBERTA_START_DOCSTRING = r"""
|
||||
|
||||
This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
|
||||
generic methods the library implements for all its model (such as downloading, saving and converting weights from
|
||||
PyTorch models)
|
||||
|
||||
This model is also a Flax Linen `flax.nn.Module
|
||||
<https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax
|
||||
Module and refer to the Flax documentation for all matter related to general usage and behavior.
|
||||
|
||||
Finally, this model supports inherent JAX features such as:
|
||||
|
||||
- `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
|
||||
- `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
|
||||
- `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
|
||||
- `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__
|
||||
|
||||
Parameters:
|
||||
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
|
||||
model. Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
|
||||
ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
|
||||
:func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`__
|
||||
position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerNorm with Bert->Roberta
|
||||
class FlaxRobertaLayerNorm(nn.Module):
|
||||
"""
|
||||
Layer normalization (https://arxiv.org/abs/1607.06450). Operates on the last axis of the input data.
|
||||
"""
|
||||
|
||||
epsilon: float = 1e-6
|
||||
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
|
||||
bias: bool = True # If True, bias (beta) is added.
|
||||
scale: bool = True # If True, multiply by scale (gamma). When the next layer is linear
|
||||
# (also e.g. nn.relu), this can be disabled since the scaling will be
|
||||
# done by the next layer.
|
||||
bias_init: jnp.ndarray = nn.initializers.zeros
|
||||
scale_init: jnp.ndarray = nn.initializers.ones
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, x):
|
||||
"""
|
||||
Applies layer normalization on the input. It normalizes the activations of the layer for each given example in
|
||||
a batch independently, rather than across a batch like Batch Normalization. i.e. applies a transformation that
|
||||
maintains the mean activation within each example close to 0 and the activation standard deviation close to 1
|
||||
|
||||
Args:
|
||||
x: the inputs
|
||||
|
||||
Returns:
|
||||
Normalized inputs (the same shape as inputs).
|
||||
"""
|
||||
features = x.shape[-1]
|
||||
mean = jnp.mean(x, axis=-1, keepdims=True)
|
||||
mean2 = jnp.mean(jax.lax.square(x), axis=-1, keepdims=True)
|
||||
var = mean2 - jax.lax.square(mean)
|
||||
mul = jax.lax.rsqrt(var + self.epsilon)
|
||||
if self.scale:
|
||||
mul = mul * jnp.asarray(self.param("gamma", self.scale_init, (features,)), self.dtype)
|
||||
y = (x - mean) * mul
|
||||
if self.bias:
|
||||
y = y + jnp.asarray(self.param("beta", self.bias_init, (features,)), self.dtype)
|
||||
return y
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbedding with Bert->Roberta
|
||||
class FlaxRobertaEmbedding(nn.Module):
|
||||
"""
|
||||
Specify a new class for doing the embedding stuff as Flax's one use 'embedding' for the parameter name and PyTorch
|
||||
use 'weight'
|
||||
"""
|
||||
|
||||
vocab_size: int
|
||||
hidden_size: int
|
||||
emb_init: Callable[..., np.ndarray] = nn.initializers.normal(stddev=0.1)
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, inputs):
|
||||
embedding = self.param("weight", self.emb_init, (self.vocab_size, self.hidden_size))
|
||||
return jnp.take(embedding, inputs, axis=0)
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->Roberta
|
||||
class FlaxRobertaEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
|
||||
vocab_size: int
|
||||
hidden_size: int
|
||||
type_vocab_size: int
|
||||
max_length: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, input_ids, token_type_ids, position_ids, attention_mask):
|
||||
|
||||
# Embed
|
||||
w_emb = FlaxRobertaEmbedding(self.vocab_size, self.hidden_size, name="word_embeddings")(
|
||||
jnp.atleast_2d(input_ids.astype("i4"))
|
||||
)
|
||||
p_emb = FlaxRobertaEmbedding(self.max_length, self.hidden_size, name="position_embeddings")(
|
||||
jnp.atleast_2d(position_ids.astype("i4"))
|
||||
)
|
||||
t_emb = FlaxRobertaEmbedding(self.type_vocab_size, self.hidden_size, name="token_type_embeddings")(
|
||||
jnp.atleast_2d(token_type_ids.astype("i4"))
|
||||
)
|
||||
|
||||
# Sum all embeddings
|
||||
summed_emb = w_emb + jnp.broadcast_to(p_emb, w_emb.shape) + t_emb
|
||||
|
||||
# Layer Norm
|
||||
layer_norm = FlaxRobertaLayerNorm(name="layer_norm")(summed_emb)
|
||||
|
||||
return layer_norm
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Roberta
|
||||
class FlaxRobertaAttention(nn.Module):
|
||||
num_heads: int
|
||||
head_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state, attention_mask):
|
||||
self_att = nn.attention.SelfAttention(num_heads=self.num_heads, qkv_features=self.head_size, name="self")(
|
||||
hidden_state, attention_mask
|
||||
)
|
||||
|
||||
layer_norm = FlaxRobertaLayerNorm(name="layer_norm")(self_att + hidden_state)
|
||||
return layer_norm
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Roberta
|
||||
class FlaxRobertaIntermediate(nn.Module):
|
||||
output_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state):
|
||||
# TODO: Add ACT2FN reference to change activation function
|
||||
dense = nn.Dense(features=self.output_size, name="dense")(hidden_state)
|
||||
return gelu(dense)
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Roberta
|
||||
class FlaxRobertaOutput(nn.Module):
|
||||
@nn.compact
|
||||
def __call__(self, intermediate_output, attention_output):
|
||||
hidden_state = nn.Dense(attention_output.shape[-1], name="dense")(intermediate_output)
|
||||
hidden_state = FlaxRobertaLayerNorm(name="layer_norm")(hidden_state + attention_output)
|
||||
return hidden_state
|
||||
|
||||
|
||||
class FlaxRobertaLayer(nn.Module):
|
||||
num_heads: int
|
||||
head_size: int
|
||||
intermediate_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state, attention_mask):
|
||||
attention = FlaxRobertaAttention(self.num_heads, self.head_size, name="attention")(
|
||||
hidden_state, attention_mask
|
||||
)
|
||||
intermediate = FlaxRobertaIntermediate(self.intermediate_size, name="intermediate")(attention)
|
||||
output = FlaxRobertaOutput(name="output")(intermediate, attention)
|
||||
|
||||
return output
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Roberta
|
||||
class FlaxRobertaLayerCollection(nn.Module):
|
||||
"""
|
||||
Stores N RobertaLayer(s)
|
||||
"""
|
||||
|
||||
num_layers: int
|
||||
num_heads: int
|
||||
head_size: int
|
||||
intermediate_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, inputs, attention_mask):
|
||||
assert self.num_layers > 0, f"num_layers should be >= 1, got ({self.num_layers})"
|
||||
|
||||
# Initialize input / output
|
||||
input_i = inputs
|
||||
|
||||
# Forward over all encoders
|
||||
for i in range(self.num_layers):
|
||||
layer = FlaxRobertaLayer(self.num_heads, self.head_size, self.intermediate_size, name=f"{i}")
|
||||
input_i = layer(input_i, attention_mask)
|
||||
return input_i
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Roberta
|
||||
class FlaxRobertaEncoder(nn.Module):
|
||||
num_layers: int
|
||||
num_heads: int
|
||||
head_size: int
|
||||
intermediate_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state, attention_mask):
|
||||
layer = FlaxRobertaLayerCollection(
|
||||
self.num_layers, self.num_heads, self.head_size, self.intermediate_size, name="layer"
|
||||
)(hidden_state, attention_mask)
|
||||
return layer
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->Roberta
|
||||
class FlaxRobertaPooler(nn.Module):
|
||||
@nn.compact
|
||||
def __call__(self, hidden_state):
|
||||
cls_token = hidden_state[:, 0]
|
||||
out = nn.Dense(hidden_state.shape[-1], name="dense")(cls_token)
|
||||
return jax.lax.tanh(out)
|
||||
|
||||
|
||||
# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert->Roberta
|
||||
class FlaxRobertaModule(nn.Module):
|
||||
vocab_size: int
|
||||
hidden_size: int
|
||||
type_vocab_size: int
|
||||
max_length: int
|
||||
num_encoder_layers: int
|
||||
num_heads: int
|
||||
head_size: int
|
||||
intermediate_size: int
|
||||
|
||||
@nn.compact
|
||||
def __call__(self, input_ids, attention_mask, token_type_ids, position_ids):
|
||||
|
||||
# Embedding
|
||||
embeddings = FlaxRobertaEmbeddings(
|
||||
self.vocab_size, self.hidden_size, self.type_vocab_size, self.max_length, name="embeddings"
|
||||
)(input_ids, token_type_ids, position_ids, attention_mask)
|
||||
|
||||
# N stacked encoding layers
|
||||
encoder = FlaxRobertaEncoder(
|
||||
self.num_encoder_layers, self.num_heads, self.head_size, self.intermediate_size, name="encoder"
|
||||
)(embeddings, attention_mask)
|
||||
|
||||
pooled = FlaxRobertaPooler(name="pooler")(encoder)
|
||||
return encoder, pooled
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
ROBERTA_START_DOCSTRING,
|
||||
)
|
||||
class FlaxRobertaModel(FlaxPreTrainedModel):
|
||||
"""
|
||||
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
||||
cross-attention is added between the self-attention layers, following the architecture described in `Attention is
|
||||
all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
|
||||
Kaiser and Illia Polosukhin.
|
||||
"""
|
||||
|
||||
model_class = FlaxRobertaModule
|
||||
config_class = RobertaConfig
|
||||
base_model_prefix = "roberta"
|
||||
|
||||
@staticmethod
|
||||
def convert_from_pytorch(pt_state: Dict, config: RobertaConfig) -> Dict:
|
||||
jax_state = dict(pt_state)
|
||||
|
||||
# Need to change some parameters name to match Flax names so that we don't have to fork any layer
|
||||
for key, tensor in pt_state.items():
|
||||
# Key parts
|
||||
key_parts = set(key.split("."))
|
||||
|
||||
# Every dense layer has "kernel" parameters instead of "weight"
|
||||
if "dense.weight" in key:
|
||||
del jax_state[key]
|
||||
key = key.replace("weight", "kernel")
|
||||
jax_state[key] = tensor
|
||||
|
||||
# SelfAttention needs also to replace "weight" by "kernel"
|
||||
if {"query", "key", "value"} & key_parts:
|
||||
|
||||
# Flax SelfAttention decomposes the heads (num_head, size // num_heads)
|
||||
if "bias" in key:
|
||||
jax_state[key] = tensor.reshape((config.num_attention_heads, -1))
|
||||
elif "weight":
|
||||
del jax_state[key]
|
||||
key = key.replace("weight", "kernel")
|
||||
tensor = tensor.reshape((config.num_attention_heads, -1, config.hidden_size)).transpose((2, 0, 1))
|
||||
jax_state[key] = tensor
|
||||
|
||||
# SelfAttention output is not a separate layer, remove one nesting
|
||||
if "attention.output.dense" in key:
|
||||
del jax_state[key]
|
||||
key = key.replace("attention.output.dense", "attention.self.out")
|
||||
jax_state[key] = tensor
|
||||
|
||||
# SelfAttention output is not a separate layer, remove nesting on layer norm
|
||||
if "attention.output.LayerNorm" in key:
|
||||
del jax_state[key]
|
||||
key = key.replace("attention.output.LayerNorm", "attention.LayerNorm")
|
||||
jax_state[key] = tensor
|
||||
|
||||
# There are some transposed parameters w.r.t their PyTorch counterpart
|
||||
if "intermediate.dense.kernel" in key or "output.dense.kernel" in key:
|
||||
jax_state[key] = tensor.T
|
||||
|
||||
# Self Attention output projection needs to be transposed
|
||||
if "out.kernel" in key:
|
||||
jax_state[key] = tensor.reshape((config.hidden_size, config.num_attention_heads, -1)).transpose(
|
||||
1, 2, 0
|
||||
)
|
||||
|
||||
# Pooler needs to transpose its kernel
|
||||
if "pooler.dense.kernel" in key:
|
||||
jax_state[key] = tensor.T
|
||||
|
||||
# Handle LayerNorm conversion
|
||||
if "LayerNorm" in key:
|
||||
del jax_state[key]
|
||||
|
||||
# Replace LayerNorm by layer_norm
|
||||
new_key = key.replace("LayerNorm", "layer_norm")
|
||||
|
||||
if "weight" in key:
|
||||
new_key = new_key.replace("weight", "gamma")
|
||||
elif "bias" in key:
|
||||
new_key = new_key.replace("bias", "beta")
|
||||
|
||||
jax_state[new_key] = tensor
|
||||
|
||||
return jax_state
|
||||
|
||||
def __init__(self, config: RobertaConfig, state: dict, seed: int = 0, **kwargs):
|
||||
model = FlaxRobertaModule(
|
||||
vocab_size=config.vocab_size,
|
||||
hidden_size=config.hidden_size,
|
||||
type_vocab_size=config.type_vocab_size,
|
||||
max_length=config.max_position_embeddings,
|
||||
num_encoder_layers=config.num_hidden_layers,
|
||||
num_heads=config.num_attention_heads,
|
||||
head_size=config.hidden_size,
|
||||
intermediate_size=config.intermediate_size,
|
||||
)
|
||||
|
||||
super().__init__(config, model, state, seed)
|
||||
|
||||
@property
|
||||
def module(self) -> nn.Module:
|
||||
return self._module
|
||||
|
||||
@add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
|
||||
def __call__(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None):
|
||||
if token_type_ids is None:
|
||||
token_type_ids = jnp.ones_like(input_ids)
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = np.arange(
|
||||
self.config.pad_token_id + 1, np.atleast_2d(input_ids).shape[-1] + self.config.pad_token_id + 1
|
||||
)
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = jnp.ones_like(input_ids)
|
||||
|
||||
return self.model.apply(
|
||||
{"params": self.params},
|
||||
jnp.array(input_ids, dtype="i4"),
|
||||
jnp.array(attention_mask, dtype="i4"),
|
||||
jnp.array(token_type_ids, dtype="i4"),
|
||||
jnp.array(position_ids, dtype="i4"),
|
||||
)
|
||||
1352
src/transformers/models/roberta/modeling_roberta.py
Normal file
1352
src/transformers/models/roberta/modeling_roberta.py
Normal file
File diff suppressed because it is too large
Load Diff
1263
src/transformers/models/roberta/modeling_tf_roberta.py
Normal file
1263
src/transformers/models/roberta/modeling_tf_roberta.py
Normal file
File diff suppressed because it is too large
Load Diff
264
src/transformers/models/roberta/tokenization_roberta.py
Normal file
264
src/transformers/models/roberta/tokenization_roberta.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for RoBERTa."""
|
||||
|
||||
import warnings
|
||||
from typing import List, Optional
|
||||
|
||||
from ...tokenization_utils import AddedToken
|
||||
from ...utils import logging
|
||||
from ..gpt2.tokenization_gpt2 import GPT2Tokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {
|
||||
"vocab_file": "vocab.json",
|
||||
"merges_file": "merges.txt",
|
||||
}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
|
||||
},
|
||||
"merges_file": {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
|
||||
},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"roberta-base": 512,
|
||||
"roberta-large": 512,
|
||||
"roberta-large-mnli": 512,
|
||||
"distilroberta-base": 512,
|
||||
"roberta-base-openai-detector": 512,
|
||||
"roberta-large-openai-detector": 512,
|
||||
}
|
||||
|
||||
|
||||
class RobertaTokenizer(GPT2Tokenizer):
|
||||
"""
|
||||
Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
|
||||
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
>>> from transformers import RobertaTokenizer
|
||||
>>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[0, 31414, 232, 328, 2]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[0, 20920, 232, 2]
|
||||
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
|
||||
one).
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
Path to the vocabulary file.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the beginning of
|
||||
sequence. The token used is the :obj:`cls_token`.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The end of sequence token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||
sequence. The token used is the :obj:`sep_token`.
|
||||
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
|
||||
sequence classification or for a text and a question for question answering. It is also used as the last
|
||||
token of a sequence built with special tokens.
|
||||
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||
The classifier token which is used when doing sequence classification (classification of the whole sequence
|
||||
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
|
||||
The token used for masking values. This is the token used when training this model with masked language
|
||||
modeling. This is the token which the model will try to predict.
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
merges_file,
|
||||
errors="replace",
|
||||
bos_token="<s>",
|
||||
eos_token="</s>",
|
||||
sep_token="</s>",
|
||||
cls_token="<s>",
|
||||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=False,
|
||||
**kwargs
|
||||
):
|
||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
|
||||
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
|
||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
||||
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def build_inputs_with_special_tokens(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
||||
adding special tokens. A RoBERTa sequence has the following format:
|
||||
|
||||
- single sequence: ``<s> X </s>``
|
||||
- pair of sequences: ``<s> A </s></s> B </s>``
|
||||
|
||||
Args:
|
||||
token_ids_0 (:obj:`List[int]`):
|
||||
List of IDs to which the special tokens will be added.
|
||||
token_ids_1 (:obj:`List[int]`, `optional`):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||
"""
|
||||
if token_ids_1 is None:
|
||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
sep = [self.sep_token_id]
|
||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||
) -> List[int]:
|
||||
"""
|
||||
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer ``prepare_for_model`` method.
|
||||
|
||||
Args:
|
||||
token_ids_0 (:obj:`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (:obj:`List[int]`, `optional`):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
"""
|
||||
if already_has_special_tokens:
|
||||
if token_ids_1 is not None:
|
||||
raise ValueError(
|
||||
"You should not supply a second sequence if the provided sequence of "
|
||||
"ids is already formatted with special tokens for the model."
|
||||
)
|
||||
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||
|
||||
if token_ids_1 is None:
|
||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
|
||||
make use of token type ids, therefore a list of zeros is returned.
|
||||
|
||||
Args:
|
||||
token_ids_0 (:obj:`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (:obj:`List[int]`, `optional`):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: List of zeros.
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
|
||||
if token_ids_1 is None:
|
||||
return len(cls + token_ids_0 + sep) * [0]
|
||||
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
|
||||
|
||||
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
|
||||
if "is_pretokenized" in kwargs:
|
||||
warnings.warn(
|
||||
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
|
||||
FutureWarning,
|
||||
)
|
||||
is_split_into_words = kwargs.pop("is_pretokenized")
|
||||
|
||||
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
|
||||
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
|
||||
text = " " + text
|
||||
return (text, kwargs)
|
||||
230
src/transformers/models/roberta/tokenization_roberta_fast.py
Normal file
230
src/transformers/models/roberta/tokenization_roberta_fast.py
Normal file
@@ -0,0 +1,230 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fast Tokenization classes for RoBERTa."""
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
from ...tokenization_utils_base import AddedToken
|
||||
from ...utils import logging
|
||||
from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
|
||||
},
|
||||
"merges_file": {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
|
||||
},
|
||||
"tokenizer_file": {
|
||||
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
|
||||
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
|
||||
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
|
||||
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
|
||||
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
|
||||
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
|
||||
},
|
||||
}
|
||||
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
||||
"roberta-base": 512,
|
||||
"roberta-large": 512,
|
||||
"roberta-large-mnli": 512,
|
||||
"distilroberta-base": 512,
|
||||
"roberta-base-openai-detector": 512,
|
||||
"roberta-large-openai-detector": 512,
|
||||
}
|
||||
|
||||
|
||||
class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
"""
|
||||
Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
|
||||
tokenizer, using byte-level Byte-Pair-Encoding.
|
||||
|
||||
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
|
||||
be encoded differently whether it is at the beginning of the sentence (without space) or not:
|
||||
|
||||
::
|
||||
|
||||
>>> from transformers import RobertaTokenizerFast
|
||||
>>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
||||
>>> tokenizer("Hello world")['input_ids']
|
||||
[0, 31414, 232, 328, 2]
|
||||
>>> tokenizer(" Hello world")['input_ids']
|
||||
[0, 20920, 232, 2]
|
||||
|
||||
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
|
||||
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
|
||||
|
||||
.. note::
|
||||
|
||||
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
|
||||
``add_prefix_space=True``.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
|
||||
methods. Users should refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
Path to the vocabulary file.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the beginning of
|
||||
sequence. The token used is the :obj:`cls_token`.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The end of sequence token.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the end of
|
||||
sequence. The token used is the :obj:`sep_token`.
|
||||
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
|
||||
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
|
||||
sequence classification or for a text and a question for question answering. It is also used as the last
|
||||
token of a sequence built with special tokens.
|
||||
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
|
||||
The classifier token which is used when doing sequence classification (classification of the whole sequence
|
||||
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
||||
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
|
||||
The token used for masking values. This is the token used when training this model with masked language
|
||||
modeling. This is the token which the model will try to predict.
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
|
||||
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether the post processing step should trim offsets to avoid including whitespaces.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["attention_mask"]
|
||||
slow_tokenizer_class = RobertaTokenizer
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
merges_file,
|
||||
tokenizer_file=None,
|
||||
errors="replace",
|
||||
bos_token="<s>",
|
||||
eos_token="</s>",
|
||||
sep_token="</s>",
|
||||
cls_token="<s>",
|
||||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=False,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
vocab_file,
|
||||
merges_file,
|
||||
tokenizer_file=tokenizer_file,
|
||||
errors=errors,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
sep_token=sep_token,
|
||||
cls_token=cls_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
mask_token=mask_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@property
|
||||
def mask_token(self) -> str:
|
||||
"""
|
||||
:obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
|
||||
not having been set.
|
||||
|
||||
Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
|
||||
comprise the space before the `<mask>`.
|
||||
"""
|
||||
if self._mask_token is None and self.verbose:
|
||||
logger.error("Using mask_token, but it is not set yet.")
|
||||
return None
|
||||
return str(self._mask_token)
|
||||
|
||||
@mask_token.setter
|
||||
def mask_token(self, value):
|
||||
"""
|
||||
Overriding the default behavior of the mask token to have it eat the space before it.
|
||||
|
||||
This is needed to preserve backward compatibility with all the previously used models based on Roberta.
|
||||
"""
|
||||
# Mask token behave like a normal word, i.e. include the space before it
|
||||
# So we set lstrip to True
|
||||
value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
|
||||
self._mask_token = value
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
||||
if token_ids_1 is None:
|
||||
return output
|
||||
|
||||
return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
|
||||
make use of token type ids, therefore a list of zeros is returned.
|
||||
|
||||
Args:
|
||||
token_ids_0 (:obj:`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (:obj:`List[int]`, `optional`):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: List of zeros.
|
||||
"""
|
||||
sep = [self.sep_token_id]
|
||||
cls = [self.cls_token_id]
|
||||
|
||||
if token_ids_1 is None:
|
||||
return len(cls + token_ids_0 + sep) * [0]
|
||||
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
|
||||
Reference in New Issue
Block a user