Reformer (#3351)
* first copy & past commit from Bert and morgans LSH code * add easy way to compare to trax original code * translate most of function * make trax lsh self attention deterministic with numpy seed + copy paste code * add same config * add same config * make layer init work * implemented hash_vectors function for lsh attention * continue reformer translation * hf LSHSelfAttentionLayer gives same output as trax layer * refactor code * refactor code * refactor code * refactor * refactor + add reformer config * delete bogus file * split reformer attention layer into two layers * save intermediate step * save intermediate step * make test work * add complete reformer block layer * finish reformer layer * implement causal and self mask * clean reformer test and refactor code * fix merge conflicts * fix merge conflicts * update init * fix device for GPU * fix chunk length init for tests * include morgans optimization * improve memory a bit * improve comment * factorize num_buckets * better testing parameters * make whole model work * make lm model work * add t5 copy paste tokenizer * add chunking feed forward * clean config * add improved assert statements * make tokenizer work * improve test * correct typo * extend config * add complexer test * add new axial position embeddings * add local block attention layer * clean tests * refactor * better testing * save intermediate progress * clean test file * make shorter input length work for model * allow variable input length * refactor * make forward pass for pretrained model work * add generation possibility * finish dropout and init * make style * refactor * add first version of RevNet Layers * make forward pass work and add convert file * make uploaded model forward pass work * make uploaded model forward pass work * refactor code * add namedtuples and cache buckets * correct head masks * refactor * made reformer more flexible * make style * remove set max length * add attention masks * fix up tests * fix lsh attention mask 
* make random seed optional for the moment * improve memory in reformer * add tests * make style * make sure masks work correctly * detach gradients * save intermediate * correct backprob through gather * make style * change back num hashes * rename to labels * fix rotation shape * fix detach * update * fix trainer * fix backward dropout * make reformer more flexible * fix conflict * fix * fix * add tests for fixed seed in reformer layer * fix trainer typo * fix typo in activations * add fp16 tests * add fp16 training * support fp16 * correct gradient bug in reformer * add fast gelu * re-add dropout for embedding dropout * better naming * better naming * renaming * finalize test branch * finalize tests * add more tests * finish tests * fix * fix type trainer * fix fp16 tests * fix tests * fix tests * fix tests * fix issue with dropout * fix dropout seeds * correct random seed on gpu * finalize random seed for dropout * finalize random seed for dropout * remove duplicate line * correct half precision bug * make style * refactor * refactor * docstring * remove sinusoidal position encodings for reformer * move chunking to modeling_utils * make style * clean config * make style * fix tests * fix auto tests * pretrained models * fix docstring * update conversion file * Update pretrained_models.rst * fix rst * fix rst * update copyright * fix test path * fix test path * fix small issue in test * include reformer in generation tests * add docs for axial position encoding * finish docs * Update convert_reformer_trax_checkpoint_to_pytorch.py * remove isort * include sams comments * remove wrong comment in utils * correct typos * fix typo * Update reformer.rst * applied morgans optimization * make style * make gpu compatible * remove bogus file * big test refactor * add example for chunking * fix typo * add to README
This commit is contained in:
committed by
GitHub
parent
877fc56410
commit
dca34695d0
@@ -47,6 +47,7 @@ from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
|
||||
from .configuration_marian import MarianConfig
|
||||
from .configuration_mmbt import MMBTConfig
|
||||
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
|
||||
from .configuration_reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
|
||||
@@ -138,6 +139,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
|
||||
from .tokenization_flaubert import FlaubertTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
|
||||
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
from .tokenization_reformer import ReformerTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
from .tokenization_t5 import T5Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast
|
||||
@@ -159,7 +161,7 @@ if is_sklearn_available():
|
||||
|
||||
# Modeling
|
||||
if is_torch_available():
|
||||
from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering
|
||||
from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, apply_chunking_to_forward
|
||||
from .modeling_auto import (
|
||||
AutoModel,
|
||||
AutoModelForPreTraining,
|
||||
@@ -190,6 +192,7 @@ if is_torch_available():
|
||||
BertForQuestionAnswering,
|
||||
load_tf_weights_in_bert,
|
||||
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
BertLayer,
|
||||
)
|
||||
from .modeling_openai import (
|
||||
OpenAIGPTPreTrainedModel,
|
||||
@@ -320,6 +323,14 @@ if is_torch_available():
|
||||
ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
from .modeling_reformer import (
|
||||
ReformerAttention,
|
||||
ReformerLayer,
|
||||
ReformerModel,
|
||||
ReformerModelWithLMHead,
|
||||
REFORMER_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
)
|
||||
|
||||
# Optimization
|
||||
from .optimization import (
|
||||
AdamW,
|
||||
|
||||
@@ -34,12 +34,18 @@ if torch.__version__ < "1.4.0":
|
||||
else:
|
||||
gelu = F.gelu
|
||||
|
||||
|
||||
def gelu_fast(x):
    """Tanh-based approximation of the GELU activation, applied element-wise.

    Evaluates ``0.5 * x * (1 + tanh(0.7978845608 * x * (1 + 0.044715 * x**2)))``
    where ``0.7978845608`` is ``sqrt(2 / pi)`` rounded to ten decimals.

    Args:
        x: tensor accepted by ``torch.tanh``.

    Returns:
        Tensor with the same shape as ``x``.
    """
    tanh_arg = x * 0.7978845608 * (1 + 0.044715 * x * x)
    return 0.5 * x * (1 + torch.tanh(tanh_arg))
|
||||
|
||||
|
||||
# Lookup table from activation-function name to the callable implementing it.
ACT2FN = {
    "relu": F.relu,
    "swish": swish,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
}
|
||||
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ from .configuration_encoder_decoder import EncoderDecoderConfig
|
||||
from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig
|
||||
from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config
|
||||
from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig
|
||||
from .configuration_reformer import ReformerConfig
|
||||
from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, TransfoXLConfig
|
||||
@@ -73,6 +74,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
("camembert", CamembertConfig,),
|
||||
("xlm-roberta", XLMRobertaConfig,),
|
||||
("bart", BartConfig,),
|
||||
("reformer", ReformerConfig,),
|
||||
("roberta", RobertaConfig,),
|
||||
("flaubert", FlaubertConfig,),
|
||||
("bert", BertConfig,),
|
||||
@@ -130,6 +132,7 @@ class AutoConfig:
|
||||
- contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
|
||||
- contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
|
||||
- contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
|
||||
- contains `reformer`: :class:`~transformers.ReformerConfig` (Reformer model)
|
||||
- contains `bert`: :class:`~transformers.BertConfig` (Bert model)
|
||||
- contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
|
||||
- contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
|
||||
|
||||
210
src/transformers/configuration_reformer.py
Normal file
210
src/transformers/configuration_reformer.py
Normal file
@@ -0,0 +1,210 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Reformer model configuration """
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
from .configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/config.json"
|
||||
}
|
||||
|
||||
|
||||
class ReformerConfig(PretrainedConfig):
    r"""
        This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`.
        It is used to instantiate a Reformer model according to the specified arguments, defining the model
        architecture.

        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
        for more information.

        Args:
            attention_head_size (:obj:`int`, optional, defaults to 64):
                Dimensionality of the projected key, query and value vectors
            attn_layers (:obj:`list(str)`, optional, defaults to ["local", "lsh", "local", "lsh", "local", "lsh"]):
                List of attention layer types in ascending order. It can be chosen between a
                LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
                The number of hidden layers of the model is the length of this list.
                For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
                For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
            axial_pos_embds (:obj:`bool`, optional, defaults to True):
                If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
            axial_norm_std (:obj:`float`, optional, defaults to 1.0):
                The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
            axial_pos_shape (:obj:`list(int)`, optional, defaults to `[64, 64]`):
                The position dims of the axial position encodings.
                During training the product of the position dims has to equal the sequence length.
                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
            axial_pos_embds_dim (:obj:`list(int)`, optional, defaults to `[64, 192]`):
                The embedding dims of the axial position encodings.
                The sum of the embedding dims has to equal the hidden size.
                For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__ .
            chunk_size_lm_head (:obj:`int`, optional, defaults to 0):
                The chunk size of the final language model feed forward head layer.
                A chunk size of 0 means that the feed forward layer is not chunked.
                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
            chunk_size_feed_forward (:obj:`int`, optional, defaults to 0):
                The chunk size of all feed forward layers in the residual attention blocks.
                A chunk size of 0 means that the feed forward layer is not chunked.
                A chunk size of n means that the feed forward layer processes n < sequence_length embeddings at a time.
                For more information on feed forward chunking, see `How does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ .
            eos_token_id (:obj:`int`, optional, defaults to 2):
                The token id for the <EOS> token.
            feed_forward_size (:obj:`int`, optional, defaults to 512):
                Dimensionality of the "feed_forward" (i.e., feed-forward) layer in the residual attention block.
            hash_seed (:obj:`int`, optional, defaults to `None`):
                Seed that can be used to make locality sensitive hashing in LSHSelfAttention deterministic. This should only be set for testing purposes. For evaluation and training purposes `hash_seed` should be set to `None` to ensure fully random rotations in the locality sensitive hashing scheme.
            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "relu"):
                The non-linear activation function (function or string) in the feed forward layer in the residual attention block.
                If string, "gelu", "relu", "swish", "gelu_new" and "gelu_fast" are supported.
            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.05):
                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
            hidden_size (:obj:`int`, optional, defaults to 256):
                Dimensionality of the output hidden states of the residual attention blocks.
            initializer_range (:obj:`float`, optional, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            is_decoder (:obj:`bool`, optional, defaults to False):
                If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
                When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
                The epsilon used by the layer normalization layers.
            local_attn_chunk_length (:obj:`int`, optional, defaults to 64):
                Length of chunk which attends to itself in LocalSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
            local_num_chunks_before (:obj:`int`, optional, defaults to 1):
                Number of previous neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
            local_num_chunks_after (:obj:`int`, optional, defaults to 0):
                Number of following neighbouring chunks to attend to in LocalSelfAttention layer in addition to itself.
            local_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.05):
                The dropout ratio for the attention probabilities in LocalSelfAttention.
            lsh_attn_chunk_length (:obj:`int`, optional, defaults to 64):
                Length of chunk which attends to itself in LSHSelfAttention. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).
            lsh_num_chunks_before (:obj:`int`, optional, defaults to 1):
                Number of previous neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
            lsh_num_chunks_after (:obj:`int`, optional, defaults to 0):
                Number of following neighbouring chunks to attend to in LSHSelfAttention layer in addition to itself.
            lsh_attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.0):
                The dropout ratio for the attention probabilities in LSHSelfAttention.
            max_position_embeddings (:obj:`int`, optional, defaults to 4096):
                The maximum sequence length that this model might ever be used with.
                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
            num_attention_heads (:obj:`int`, optional, defaults to 2):
                Number of attention heads for each attention layer in the Transformer encoder.
            num_buckets (:obj:`int` or :obj:`list(int)`, optional, defaults to `32`):
                Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. Each query key vector is hashed into a hash in `1, ..., num_buckets`.
                The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors.
                The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_attn_chunk_length.
                A list is stored as a tuple on the configuration object.
            num_hashes (:obj:`int`, optional, defaults to 1):
                Number of hashing rounds (e.g. number of random rotations) in the locality sensitive hashing scheme.
                The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes.
            pad_token_id (:obj:`int`, optional, defaults to 0):
                The token id for the <PAD> token.
            vocab_size (:obj:`int`, optional, defaults to 320):
                Vocabulary size of the Reformer model. Defines the different tokens that
                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ReformerModel`.

        Example::

            from transformers import ReformerModel, ReformerConfig

            # Initializing a Reformer configuration
            configuration = ReformerConfig()

            # Initializing a Reformer model
            model = ReformerModel(configuration)

            # Accessing the model configuration
            configuration = model.config

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
                A dictionary containing all the available pre-trained checkpoints.
    """
    pretrained_config_archive_map = REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "reformer"

    def __init__(
        self,
        attention_head_size=64,
        attn_layers=["local", "lsh", "local", "lsh", "local", "lsh"],
        axial_norm_std=1.0,
        axial_pos_embds=True,
        axial_pos_shape=[64, 64],
        axial_pos_embds_dim=[64, 192],
        chunk_size_lm_head=0,
        chunk_size_feed_forward=0,
        eos_token_id=2,
        feed_forward_size=512,
        hash_seed=None,
        hidden_act="relu",
        hidden_dropout_prob=0.05,
        hidden_size=256,
        initializer_range=0.02,
        is_decoder=False,
        layer_norm_eps=1e-12,
        local_num_chunks_before=1,
        local_num_chunks_after=0,
        local_attention_probs_dropout_prob=0.05,
        local_attn_chunk_length=64,
        lsh_attn_chunk_length=64,
        lsh_attention_probs_dropout_prob=0.0,
        lsh_num_chunks_before=1,
        lsh_num_chunks_after=0,
        max_position_embeddings=4096,
        num_attention_heads=2,
        num_buckets=32,
        num_hashes=1,
        pad_token_id=0,
        vocab_size=320,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, is_decoder=is_decoder, **kwargs)

        self.hash_seed = hash_seed
        self.vocab_size = vocab_size
        self.attention_head_size = attention_head_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hashes = num_hashes
        # One hidden layer per entry of `attn_layers`.
        self.num_hidden_layers = len(attn_layers)
        # A factorized bucket count given as a list is frozen into a tuple.
        self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets
        self.lsh_attn_chunk_length = lsh_attn_chunk_length
        self.local_attn_chunk_length = local_attn_chunk_length
        self.lsh_num_chunks_after = lsh_num_chunks_after
        self.lsh_num_chunks_before = lsh_num_chunks_before
        self.local_num_chunks_after = local_num_chunks_after
        self.local_num_chunks_before = local_num_chunks_before
        self.hidden_act = hidden_act
        self.feed_forward_size = feed_forward_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob
        self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.axial_pos_embds = axial_pos_embds
        self.axial_pos_shape = tuple(axial_pos_shape)
        self.axial_pos_embds_dim = tuple(axial_pos_embds_dim)
        self.axial_norm_std = axial_norm_std
        self.chunk_size_lm_head = chunk_size_lm_head
        self.chunk_size_feed_forward = chunk_size_feed_forward
        # Store a copy: the default is a single list object shared by every call,
        # so keeping a reference would let `config.attn_layers.append(...)` silently
        # change the default for all configs created afterwards.
        self.attn_layers = list(attn_layers)
|
||||
211
src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
Executable file
211
src/transformers/convert_reformer_trax_checkpoint_to_pytorch.py
Executable file
@@ -0,0 +1,211 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Convert Reformer checkpoint."""
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from transformers import ReformerConfig, ReformerModelWithLMHead
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
|
||||
def set_param(torch_layer, weight, bias=None):
    """Overwrite the parameters of a single torch layer.

    Args:
        torch_layer: module exposing ``weight`` (and ``bias`` when ``bias`` is given).
        weight: tensor whose shape must equal ``torch_layer.weight.shape``.
        bias: optional tensor whose shape must equal ``torch_layer.bias.shape``;
            when ``None`` the layer's bias is left untouched.

    Raises:
        AssertionError: if a supplied tensor's shape differs from the parameter it replaces.
    """
    expected_weight_shape = torch_layer.weight.shape
    assert expected_weight_shape == weight.shape, "{} layer.weight does not match".format(torch_layer)
    torch_layer.weight = torch.nn.Parameter(weight)

    if bias is None:
        return

    expected_bias_shape = torch_layer.bias.shape
    assert expected_bias_shape == bias.shape, "{} layer.bias does not match".format(torch_layer)
    torch_layer.bias = torch.nn.Parameter(bias)
|
||||
|
||||
|
||||
def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size):
    """Copy the weights of an LSH attention layer into ``torch_layer``.

    Args:
        weights: indexable holding, in order, the shared query/key projection,
            the value projection and the output projection.
            NOTE(review): the projections are presumably 3-D arrays (``transpose(1, 2)``
            is applied to them) — confirm the exact axis layout against the checkpoint.
        torch_layer: attention module exposing ``self_attention.query_key``,
            ``self_attention.value`` and ``output.dense``.
        hidden_size: size of the last dimension after flattening each projection.
    """

    def _flatten_projection(array):
        # Swap the last two axes, then collapse everything but `hidden_size`.
        return torch.tensor(array).transpose(1, 2).contiguous().view(-1, hidden_size)

    query_key_array = np.asarray(weights[0])
    value_array = np.asarray(weights[1])
    output_array = np.asarray(weights[2])

    self_attention = torch_layer.self_attention
    set_param(self_attention.query_key, _flatten_projection(query_key_array))
    set_param(self_attention.value, _flatten_projection(value_array))
    # The output projection is flattened first and transposed afterwards.
    set_param(
        torch_layer.output.dense, torch.tensor(output_array).view(-1, hidden_size).contiguous().transpose(0, 1),
    )
|
||||
|
||||
|
||||
def set_layer_weights_in_torch_local(weights, torch_layer, hidden_size):
    """Copy the weights of a local attention layer into ``torch_layer``.

    Args:
        weights: indexable holding, in order, the query, key, value and output
            projections.
            NOTE(review): the projections are presumably 3-D arrays (``transpose(1, 2)``
            is applied to them) — confirm the exact axis layout against the checkpoint.
        torch_layer: attention module exposing ``self_attention.query``,
            ``self_attention.key``, ``self_attention.value`` and ``output.dense``.
        hidden_size: size of the last dimension after flattening each projection.
    """

    def _flatten_projection(array):
        # Swap the last two axes, then collapse everything but `hidden_size`.
        return torch.tensor(array).transpose(1, 2).contiguous().view(-1, hidden_size)

    query_array = np.asarray(weights[0])
    key_array = np.asarray(weights[1])
    value_array = np.asarray(weights[2])
    output_array = np.asarray(weights[3])

    self_attention = torch_layer.self_attention
    set_param(self_attention.query, _flatten_projection(query_array))
    set_param(self_attention.key, _flatten_projection(key_array))
    set_param(self_attention.value, _flatten_projection(value_array))
    # The output projection is flattened first and transposed afterwards.
    set_param(
        torch_layer.output.dense, torch.tensor(output_array).view(-1, hidden_size).contiguous().transpose(0, 1),
    )
|
||||
|
||||
|
||||
def set_block_weights_in_torch(weights, torch_block, hidden_size):
    """Copy the weights of one trax block into a torch Reformer block.

    Fills, in order: the attention layer norm, the attention projections, the
    feed-forward layer norm, the feed-forward input dense layer and the
    feed-forward output dense layer.

    Args:
        weights: nested trax weight structure for a single block; indexed as
            ``weights[0]`` (attention part) and ``weights[2]`` (feed-forward part).
        torch_block: module exposing ``attention`` and ``feed_forward`` sub-modules.
        hidden_size: forwarded to the attention weight loaders.
    """
    # --- attention layer norm ---
    attn_norm_params = weights[0][0][0]
    attn_norm_weight = np.asarray(attn_norm_params[0])
    attn_norm_bias = np.asarray(attn_norm_params[1])
    set_param(
        torch_block.attention.layer_norm, torch.tensor(attn_norm_weight), torch.tensor(attn_norm_bias),
    )

    # --- attention projections ---
    # Fewer than four arrays means LSH attention (shared query/key, value, output);
    # otherwise local attention (query, key, value, output).
    attn_weights = weights[0][1]
    load_attention = set_layer_weights_in_torch_lsh if len(attn_weights) < 4 else set_layer_weights_in_torch_local
    load_attention(attn_weights, torch_block.attention, hidden_size)

    # --- feed-forward weights ---
    ff_weights = weights[2][0][2][2]
    # A four-element container wraps the actual weights at index 2 (chunked feed forward).
    if len(ff_weights) == 4:
        ff_weights = ff_weights[2]

    # feed-forward layer norm
    ff_norm_weight = np.asarray(ff_weights[0][0])
    ff_norm_bias = np.asarray(ff_weights[0][1])
    set_param(
        torch_block.feed_forward.layer_norm, torch.tensor(ff_norm_weight), torch.tensor(ff_norm_bias),
    )

    # feed-forward input dense layer (weight transposed before loading)
    ff_in_weight = np.asarray(ff_weights[1][0])
    ff_in_bias = np.asarray(ff_weights[1][1])
    set_param(
        torch_block.feed_forward.dense.dense,
        torch.tensor(ff_in_weight).transpose(0, 1).contiguous(),
        torch.tensor(ff_in_bias),
    )

    # feed-forward output dense layer (weight transposed before loading)
    ff_out_weight = np.asarray(ff_weights[4][0])
    ff_out_bias = np.asarray(ff_weights[4][1])
    set_param(
        torch_block.feed_forward.output.dense,
        torch.tensor(ff_out_weight).transpose(0, 1).contiguous(),
        torch.tensor(ff_out_bias),
    )
|
||||
|
||||
|
||||
def set_model_weights_in_torch(weights, torch_model, hidden_size):
    """Copy all trax weights into a torch Reformer model with LM head.

    Loads, in order: word embeddings, position embeddings (only when
    ``weights[3]`` is a tuple), every encoder block, the final layer norm and
    the LM head decoder.

    Args:
        weights: nested trax weight structure; indices 1, 3, 5 and 6 are read.
        torch_model: model exposing ``reformer`` (with ``embeddings`` and
            ``encoder``) and ``lm_head.decoder``.
        hidden_size: forwarded to the per-block weight loader.

    Raises:
        AssertionError: if a position-embedding shape or the number of layers
            does not match between the trax weights and the torch model.
    """
    # reformer model
    torch_model_reformer = torch_model.reformer

    # word embeds
    word_embeddings = np.asarray(weights[1])
    set_param(
        torch_model_reformer.embeddings.word_embeddings, torch.tensor(word_embeddings),
    )

    # Position embeddings are only loaded when the checkpoint stores them as a tuple
    # with one entry per weight in `position_embeddings.weights`.
    if isinstance(weights[3], tuple):
        position_embeddings = torch_model_reformer.embeddings.position_embeddings
        for emb_idx in range(len(position_embeddings.weights)):
            emb_weights = np.asarray(weights[3][emb_idx][0])
            # The message names the failing entry by index. It previously indexed the
            # module itself rather than its `weights`, which would fail while building
            # the message and hide the real shape mismatch.
            assert (
                position_embeddings.weights[emb_idx].shape == emb_weights.shape
            ), "position_embeddings.weights[{}] emb does not match".format(emb_idx)
            position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights))

    trax_layer_weights = weights[5]
    # Each torch layer consumes four consecutive trax entries; one extra trailing entry is expected.
    assert len(torch_model_reformer.encoder.layers) * 4 + 1 == len(
        trax_layer_weights
    ), "HF and trax model do not have the same number of layers"
    for layer_idx, layer in enumerate(torch_model_reformer.encoder.layers):
        block_weights = trax_layer_weights[4 * layer_idx : 4 * (layer_idx + 1)]
        set_block_weights_in_torch(block_weights, layer, hidden_size)

    # output weights
    out_weights = weights[6]

    # output layer norm
    layer_norm_out_weight = np.asarray(out_weights[0][0])
    layer_norm_out_bias = np.asarray(out_weights[0][1])
    set_param(
        torch_model_reformer.encoder.layer_norm,
        torch.tensor(layer_norm_out_weight),
        torch.tensor(layer_norm_out_bias),
    )

    # output embeddings (weight transposed before loading)
    output_embed_weights = np.asarray(out_weights[2][0])
    output_embed_bias = np.asarray(out_weights[2][1])
    set_param(
        torch_model.lm_head.decoder,
        torch.tensor(output_embed_weights).transpose(0, 1).contiguous(),
        torch.tensor(output_embed_bias),
    )
|
||||
|
||||
|
||||
def convert_trax_checkpoint_to_pytorch(trax_model_pkl_path, config_file, pytorch_dump_path):
    """Build a torch Reformer LM from a config file, load pickled trax weights, and save it.

    Args:
        trax_model_pkl_path: path to a pickle file containing a mapping with a ``"weights"`` entry.
        config_file: path to the JSON file describing the ``ReformerConfig``.
        pytorch_dump_path: destination file for the saved ``state_dict``.
    """
    # Initialise the PyTorch model from the JSON configuration
    reformer_config = ReformerConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(reformer_config)))
    torch_model = ReformerModelWithLMHead(reformer_config)

    # NOTE(review): unpickling executes arbitrary code embedded in the file —
    # only run this on checkpoints from a trusted source.
    with open(trax_model_pkl_path, "rb") as checkpoint_file:
        checkpoint = pickle.load(checkpoint_file)
        trax_weights = checkpoint["weights"]

    set_model_weights_in_torch(trax_weights, torch_model, reformer_config.hidden_size)

    # Persist only the state dict
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(torch_model.state_dict(), pytorch_dump_path)
|
||||
|
||||
|
||||
# Command-line entry point: all three paths are mandatory.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        # The input is a pickled trax checkpoint (read with `pickle.load`), not a TensorFlow checkpoint.
        "--trax_model_pkl_path",
        default=None,
        type=str,
        required=True,
        help="Path to the pickled trax model checkpoint.",
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained Reformer model. \n"
        "This specifies the model architecture.",
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()
    convert_trax_checkpoint_to_pytorch(args.trax_model_pkl_path, args.config_file, args.pytorch_dump_path)
|
||||
@@ -31,6 +31,7 @@ from .configuration_auto import (
|
||||
FlaubertConfig,
|
||||
GPT2Config,
|
||||
OpenAIGPTConfig,
|
||||
ReformerConfig,
|
||||
RobertaConfig,
|
||||
T5Config,
|
||||
TransfoXLConfig,
|
||||
@@ -97,6 +98,7 @@ from .modeling_flaubert import (
|
||||
)
|
||||
from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2Model
|
||||
from .modeling_openai import OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OpenAIGPTModel
|
||||
from .modeling_reformer import ReformerModel, ReformerModelWithLMHead
|
||||
from .modeling_roberta import (
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
RobertaForMaskedLM,
|
||||
@@ -179,6 +181,7 @@ MODEL_MAPPING = OrderedDict(
|
||||
(XLMConfig, XLMModel),
|
||||
(CTRLConfig, CTRLModel),
|
||||
(ElectraConfig, ElectraModel),
|
||||
(ReformerConfig, ReformerModel),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -222,6 +225,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
(CTRLConfig, CTRLLMHeadModel),
|
||||
(ElectraConfig, ElectraForMaskedLM),
|
||||
(EncoderDecoderConfig, EncoderDecoderModel),
|
||||
(ReformerConfig, ReformerModelWithLMHead),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
1763
src/transformers/modeling_reformer.py
Normal file
1763
src/transformers/modeling_reformer.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -13,8 +13,8 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch BERT model."""
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import os
|
||||
from typing import Callable, Tuple
|
||||
@@ -175,7 +175,7 @@ class ModuleUtilsMixin:
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
return extended_attention_mask
|
||||
|
||||
def get_head_mask(self, head_mask, num_hidden_layers):
|
||||
def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False):
|
||||
"""
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we keep the head
|
||||
@@ -189,6 +189,8 @@ class ModuleUtilsMixin:
|
||||
"""
|
||||
if head_mask is not None:
|
||||
head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
|
||||
if is_attention_chunked is True:
|
||||
head_mask = head_mask.unsqueeze(-1)
|
||||
else:
|
||||
head_mask = [None] * num_hidden_layers
|
||||
|
||||
@@ -786,6 +788,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
attention_mask=None,
|
||||
decoder_start_token_id=None,
|
||||
use_cache=None,
|
||||
**model_specific_kwargs
|
||||
):
|
||||
r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
|
||||
|
||||
@@ -863,6 +866,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
use_cache: (`optional`) bool
|
||||
If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`.
|
||||
|
||||
model_specific_kwargs: (`optional`) dict
|
||||
Additional model specific kwargs will be forwarded to the `forward` function of the model.
|
||||
|
||||
Return:
|
||||
|
||||
output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`
|
||||
@@ -1116,6 +1122,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_outputs=encoder_outputs,
|
||||
attention_mask=attention_mask,
|
||||
use_cache=use_cache,
|
||||
model_specific_kwargs=model_specific_kwargs,
|
||||
)
|
||||
else:
|
||||
output = self._generate_no_beam_search(
|
||||
@@ -1138,6 +1145,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_outputs=encoder_outputs,
|
||||
attention_mask=attention_mask,
|
||||
use_cache=use_cache,
|
||||
model_specific_kwargs=model_specific_kwargs,
|
||||
)
|
||||
|
||||
return output
|
||||
@@ -1163,6 +1171,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_outputs,
|
||||
attention_mask,
|
||||
use_cache,
|
||||
model_specific_kwargs,
|
||||
):
|
||||
""" Generate sequences for each example without beam search (num_beams == 1).
|
||||
All returned sequence are generated independantly.
|
||||
@@ -1175,7 +1184,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
|
||||
while cur_len < max_length:
|
||||
model_inputs = self.prepare_inputs_for_generation(
|
||||
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache
|
||||
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
|
||||
)
|
||||
|
||||
outputs = self(**model_inputs)
|
||||
@@ -1288,6 +1297,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
encoder_outputs,
|
||||
attention_mask,
|
||||
use_cache,
|
||||
model_specific_kwargs,
|
||||
):
|
||||
""" Generate sequences for each example with beam search.
|
||||
"""
|
||||
@@ -1314,7 +1324,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin):
|
||||
|
||||
while cur_len < max_length:
|
||||
model_inputs = self.prepare_inputs_for_generation(
|
||||
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache
|
||||
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
|
||||
)
|
||||
outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size)
|
||||
next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size)
|
||||
@@ -2087,3 +2097,66 @@ def prune_layer(layer, index, dim=None):
|
||||
return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
|
||||
else:
|
||||
raise ValueError("Can't prune layer of class {}".format(layer.__class__))
|
||||
|
||||
|
||||
def apply_chunking_to_forward(
    chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
) -> torch.Tensor:
    """
    Apply `forward_fn` to `input_tensors` chunk by chunk along `chunk_dim` to save memory.

    Every input tensor is split into parts of size `chunk_size` over the dimension `chunk_dim`,
    `forward_fn` is called once per group of aligned parts, and the partial outputs are concatenated
    again over `chunk_dim`.
    If `forward_fn` is independent across `chunk_dim`, the result is the same as calling
    `forward_fn(*input_tensors)` directly.

    Args:
        chunk_size: int - the size of each chunk along `chunk_dim`.
            `num_chunks` = `input_tensors[0].shape[chunk_dim] // chunk_size`.
            If `chunk_size` is not positive, no chunking is done and `forward_fn` is applied once.
        chunk_dim: int - the dimension over which the input_tensors should be chunked
        forward_fn: fn - the forward fn of the model; it must accept exactly `len(input_tensors)` parameters
        input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked;
            all of them must have the same shape
    Returns:
        a Tensor with the same shape the `forward_fn` would have given if applied directly

    Raises:
        AssertionError: if no input tensor is given, if the input tensors differ in shape, if the number
            of parameters of `forward_fn` differs from the number of input tensors, or if the chunked
            dimension is not a multiple of `chunk_size`.

    Examples::

        # rename the usual forward() fn to forward_chunk()
        def forward_chunk(self, hidden_states):
            hidden_states = self.decoder(hidden_states)
            return hidden_states

        # implement a chunked forward function
        def forward(self, hidden_states):
            return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states)
    """

    assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)
    tensor_shape = input_tensors[0].shape
    assert all(
        input_tensor.shape == tensor_shape for input_tensor in input_tensors
    ), "All input tensors have to be of the same shape"

    # inspect.signature exists since python 3.5 and is a python method -> no problem with backward compatibility
    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
    assert num_args_in_forward_chunk_fn == len(
        input_tensors
    ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format(
        num_args_in_forward_chunk_fn, len(input_tensors)
    )

    if chunk_size > 0:
        # report the *size* of the chunked dimension, not a slice of the tensor itself
        chunked_dim_size = tensor_shape[chunk_dim]
        assert (
            chunked_dim_size % chunk_size == 0
        ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format(
            chunked_dim_size, chunk_size
        )

        num_chunks = chunked_dim_size // chunk_size

        # chunk every input tensor into a tuple of `num_chunks` parts
        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
        # apply forward fn to every group of aligned parts
        output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
        # concatenate output at same dimension
        return torch.cat(output_chunks, dim=chunk_dim)

    # chunking disabled: run the forward fn once on the full tensors
    return forward_fn(*input_tensors)
|
||||
|
||||
@@ -30,6 +30,7 @@ from .configuration_auto import (
|
||||
FlaubertConfig,
|
||||
GPT2Config,
|
||||
OpenAIGPTConfig,
|
||||
ReformerConfig,
|
||||
RobertaConfig,
|
||||
T5Config,
|
||||
TransfoXLConfig,
|
||||
@@ -49,6 +50,7 @@ from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
|
||||
from .tokenization_flaubert import FlaubertTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
|
||||
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
|
||||
from .tokenization_reformer import ReformerTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
|
||||
from .tokenization_t5 import T5Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast
|
||||
@@ -69,6 +71,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
||||
(XLMRobertaConfig, (XLMRobertaTokenizer, None)),
|
||||
(BartConfig, (BartTokenizer, None)),
|
||||
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
|
||||
(ReformerConfig, (ReformerTokenizer, None)),
|
||||
(ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
|
||||
(BertConfig, (BertTokenizer, BertTokenizerFast)),
|
||||
(OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
|
||||
|
||||
179
src/transformers/tokenization_reformer.py
Normal file
179
src/transformers/tokenization_reformer.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Tokenization class for model Reformer."""
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
from shutil import copyfile
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# The "▁" (U+2581) character, which SentencePiece uses as its whitespace meta symbol.
SPIECE_UNDERLINE = "▁"

# Keyword argument name of the tokenizer `__init__` -> file name used when the
# tokenizer is serialized.
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

# Keyword argument name of the tokenizer `__init__` -> {model shortcut name ->
# URL of the pretrained vocabulary file}.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model"
    }
}

# Model shortcut name -> maximum input length of the model.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/reformer-crime-and-punishment": 524288,
}
|
||||
|
||||
|
||||
class ReformerTokenizer(PreTrainedTokenizer):
    """
    Constructs a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`string`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
            Additional special tokens used by the tokenizer. :obj:`None` is treated as an empty list.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        additional_special_tokens=None,
        **kwargs
    ):
        # A fresh list per call: a `[]` default would be one list object shared by every instance.
        if additional_special_tokens is None:
            additional_special_tokens = []

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._load_sp_model()

    def _load_sp_model(self):
        """ Create `self.sp_model` and load the SentencePiece model stored at `self.vocab_file`.

        Raises:
            ImportError: re-raised (after logging a hint) if the `sentencepiece` package is not installed.
        """
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use ReformerTokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """ Number of pieces in the loaded SentencePiece model (added tokens are not counted). """
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """ Returns a dict mapping every token (SentencePiece pieces plus added tokens) to its id. """
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # The SentencePiece processor is dropped from the pickled state;
        # `__setstate__` reloads it from `self.vocab_file`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self._load_sp_model()

    def _tokenize(self, text, sample=False):
        """ Take as input a string and return a list of strings (tokens) for words/sub-words.

        If `sample` is True, the segmentation is sampled via `SampleEncodeAsPieces(text, 64, 0.1)`
        instead of using the deterministic `EncodeAsPieces(text)`.
        """
        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            # NOTE(review): 64 and 0.1 are presumably SentencePiece's `nbest_size` and `alpha` - confirm
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        return pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab.

        Raises:
            IndexError: if `index` is not smaller than the number of SentencePiece pieces.
        """
        if index < self.sp_model.get_piece_size():
            return self.sp_model.IdToPiece(index)
        # Previously this path hit an unbound local variable; fail with an explicit error instead.
        raise IndexError(
            "Token id {} is out of range for a vocabulary of size {}".format(index, self.sp_model.get_piece_size())
        )

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = self.sp_model.decode_pieces(tokens)
        return out_string

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) to a directory.

        Returns:
            A 1-tuple with the path of the saved vocabulary file, or :obj:`None` (after logging an error)
            if `save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary already lives at the target path.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
|
||||
Reference in New Issue
Block a user