Merge branch 'master' into RoBERTa
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
__version__ = "1.0.0"
|
||||
from .tokenization_auto import AutoTokenizer
|
||||
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||
@@ -8,6 +9,10 @@ from .tokenization_xlm import XLMTokenizer
|
||||
from .tokenization_roberta import RobertaTokenizer
|
||||
from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
|
||||
|
||||
from .tokenization_utils import (PreTrainedTokenizer)
|
||||
|
||||
from .modeling_auto import (AutoConfig, AutoModel)
|
||||
|
||||
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
|
||||
BertForMaskedLM, BertForNextSentencePrediction,
|
||||
BertForSequenceClassification, BertForMultipleChoice,
|
||||
@@ -42,4 +47,4 @@ from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
|
||||
from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
|
||||
WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
|
||||
|
||||
from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
|
||||
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
|
||||
|
||||
@@ -58,7 +58,7 @@ if __name__ == "__main__":
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
help = "Path to the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--pytorch_dump_folder_path",
|
||||
default = None,
|
||||
type = str,
|
||||
|
||||
@@ -58,7 +58,7 @@ if __name__ == "__main__":
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
help = "Path to the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--pytorch_dump_folder_path",
|
||||
default = None,
|
||||
type = str,
|
||||
|
||||
@@ -20,7 +20,7 @@ import argparse
|
||||
import torch
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from pytorch_pretrained_bert.modeling import BertModel
|
||||
from pytorch_transformers.modeling import BertModel
|
||||
|
||||
|
||||
def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
|
||||
@@ -41,7 +41,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
|
||||
N BertForQuestionAnswering
|
||||
"""
|
||||
|
||||
tensors_to_transopse = (
|
||||
tensors_to_transpose = (
|
||||
"dense.weight",
|
||||
"attention.self.query",
|
||||
"attention.self.key",
|
||||
@@ -62,34 +62,34 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
|
||||
if not os.path.isdir(ckpt_dir):
|
||||
os.makedirs(ckpt_dir)
|
||||
|
||||
session = tf.Session()
|
||||
state_dict = model.state_dict()
|
||||
tf_vars = []
|
||||
|
||||
def to_tf_var_name(name:str):
|
||||
for patt, repl in iter(var_map):
|
||||
name = name.replace(patt, repl)
|
||||
return 'bert/{}'.format(name)
|
||||
|
||||
def assign_tf_var(tensor:np.ndarray, name:str):
|
||||
tmp_var = tf.Variable(initial_value=tensor)
|
||||
tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name)
|
||||
op = tf.assign(ref=tf_var, value=tmp_var)
|
||||
session.run(tf.variables_initializer([tmp_var, tf_var]))
|
||||
session.run(fetches=[op, tf_var])
|
||||
def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
|
||||
tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
|
||||
tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
|
||||
session.run(tf.variables_initializer([tf_var]))
|
||||
session.run(tf_var)
|
||||
return tf_var
|
||||
|
||||
for var_name in state_dict:
|
||||
tf_name = to_tf_var_name(var_name)
|
||||
torch_tensor = state_dict[var_name].numpy()
|
||||
if any([x in var_name for x in tensors_to_transopse]):
|
||||
torch_tensor = torch_tensor.T
|
||||
tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
|
||||
tf_vars.append(tf_tensor)
|
||||
print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
|
||||
tf.reset_default_graph()
|
||||
with tf.Session() as session:
|
||||
for var_name in state_dict:
|
||||
tf_name = to_tf_var_name(var_name)
|
||||
torch_tensor = state_dict[var_name].numpy()
|
||||
if any([x in var_name for x in tensors_to_transpose]):
|
||||
torch_tensor = torch_tensor.T
|
||||
tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
|
||||
tf.keras.backend.set_value(tf_var, torch_tensor)
|
||||
tf_weight = session.run(tf_var)
|
||||
print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
|
||||
|
||||
saver = tf.train.Saver(tf_vars)
|
||||
saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
|
||||
saver = tf.train.Saver(tf.trainable_variables())
|
||||
saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
|
||||
|
||||
|
||||
def main(raw_args=None):
|
||||
|
||||
@@ -47,7 +47,7 @@ if __name__ == "__main__":
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
help = "Path to the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--bert_config_file",
|
||||
default = None,
|
||||
type = str,
|
||||
|
||||
@@ -24,11 +24,10 @@ from io import open
|
||||
import torch
|
||||
|
||||
import pytorch_transformers.tokenization_transfo_xl as data_utils
|
||||
from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
TransfoXLConfig,
|
||||
TransfoXLLMHeadModel,
|
||||
load_tf_weights_in_transfo_xl)
|
||||
|
||||
from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
|
||||
from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
|
||||
load_tf_weights_in_transfo_xl)
|
||||
from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
|
||||
@@ -79,7 +79,7 @@ if __name__ == "__main__":
|
||||
default = None,
|
||||
type = str,
|
||||
required = True,
|
||||
help = "Path the TensorFlow checkpoint path.")
|
||||
help = "Path to the TensorFlow checkpoint path.")
|
||||
parser.add_argument("--xlnet_config_file",
|
||||
default = None,
|
||||
type = str,
|
||||
|
||||
@@ -38,10 +38,13 @@ except ImportError:
|
||||
try:
|
||||
from pathlib import Path
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = Path(
|
||||
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
|
||||
os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
|
||||
except (AttributeError, ImportError):
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
default_cache_path)
|
||||
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
|
||||
os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
|
||||
default_cache_path))
|
||||
|
||||
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
|
||||
|
||||
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
@@ -70,7 +73,7 @@ def filename_to_url(filename, cache_dir=None):
|
||||
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
|
||||
@@ -98,7 +101,7 @@ def cached_path(url_or_filename, cache_dir=None):
|
||||
make sure the file exists and then return the path.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||
url_or_filename = str(url_or_filename)
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
@@ -187,7 +190,7 @@ def get_from_cache(url, cache_dir=None):
|
||||
If it's not there, download it. Then return the path to the cached file.
|
||||
"""
|
||||
if cache_dir is None:
|
||||
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||
cache_dir = PYTORCH_TRANSFORMERS_CACHE
|
||||
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||
cache_dir = str(cache_dir)
|
||||
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
|
||||
|
||||
236
pytorch_transformers/modeling_auto.py
Normal file
236
pytorch_transformers/modeling_auto.py
Normal file
@@ -0,0 +1,236 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from .modeling_bert import BertConfig, BertModel
|
||||
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
|
||||
from .modeling_gpt2 import GPT2Config, GPT2Model
|
||||
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
|
||||
from .modeling_xlnet import XLNetConfig, XLNetModel
|
||||
from .modeling_xlm import XLMConfig, XLMModel
|
||||
|
||||
from .modeling_utils import PreTrainedModel, SequenceSummary
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class AutoConfig(object):
|
||||
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
|
||||
that will be instantiated as one of the configuration classes of the library
|
||||
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
The `from_pretrained()` method take care of returning the correct model class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
def __init__(self):
|
||||
raise EnvironmentError("AutoConfig is designed to be instantiated "
|
||||
"using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||
r""" Instantiate a one of the configuration classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The configuration class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a saved configuration `file`.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**return_unused_kwargs**: (`optional`) bool:
|
||||
- If False, then this function returns just the final configuration object.
|
||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
|
||||
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
|
||||
ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key/value pairs with which to update the configuration object after loading.
|
||||
- The values in kwargs of any keys which are configuration attributes will be used
|
||||
to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the `return_unused_kwargs` keyword parameter.
|
||||
|
||||
Examples::
|
||||
|
||||
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
|
||||
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
||||
assert config.output_attention == True
|
||||
config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
|
||||
foo=False, return_unused_kwargs=True)
|
||||
assert config.output_attention == True
|
||||
assert unused_kwargs == {'foo': False}
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'gpt2' in pretrained_model_name_or_path:
|
||||
return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'transfo-xl' in pretrained_model_name_or_path:
|
||||
return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'xlnet' in pretrained_model_name_or_path:
|
||||
return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
elif 'xlm' in pretrained_model_name_or_path:
|
||||
return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
|
||||
|
||||
class AutoModel(object):
|
||||
r"""
|
||||
:class:`~pytorch_transformers.AutoModel` is a generic model class
|
||||
that will be instantiated as one of the base model classes of the library
|
||||
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
The `from_pretrained()` method take care of returning the correct model class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
def __init__(self):
|
||||
raise EnvironmentError("AutoModel is designed to be instantiated "
|
||||
"using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r""" Instantiate a one of the base model classes of the library
|
||||
from a pre-trained model configuration.
|
||||
|
||||
The base model class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertConfig (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetConfig (XLNet model)
|
||||
- contains `xlm`: XLMConfig (XLM model)
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
|
||||
To train the model, you should first set it back in training mode with `model.train()`
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
|
||||
In this case, ``from_tf`` should be set to True and a configuration object should be
|
||||
provided as `config` argument. This loading option is slower than converting the TensorFlow
|
||||
checkpoint in a PyTorch model using the provided conversion scripts and loading
|
||||
the PyTorch model afterwards.
|
||||
**model_args**: (`optional`) Sequence:
|
||||
All remaning positional arguments will be passed to the underlying model's __init__ function
|
||||
**config**: an optional configuration for the model to use instead of an automatically loaded configuation.
|
||||
Configuration can be automatically loaded when:
|
||||
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
|
||||
- the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
|
||||
**state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
|
||||
from saved weights file.
|
||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
|
||||
a simpler option.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**output_loading_info**: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key, values to update the configuration object after loading.
|
||||
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
|
||||
|
||||
- If a configuration is provided with `config`, **kwargs will be directly passed
|
||||
to the underlying model's __init__ method.
|
||||
- If a configuration is not provided, **kwargs will be first passed to the pretrained
|
||||
model configuration class loading function (`PretrainedConfig.from_pretrained`).
|
||||
Each key of **kwargs that corresponds to a configuration attribute
|
||||
will be used to override said attribute with the supplied **kwargs value.
|
||||
Remaining keys that do not correspond to any configuration attribute will
|
||||
be passed to the underlying model's __init__ function.
|
||||
|
||||
Examples::
|
||||
|
||||
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
|
||||
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'gpt2' in pretrained_model_name_or_path:
|
||||
return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'transfo-xl' in pretrained_model_name_or_path:
|
||||
return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'xlnet' in pretrained_model_name_or_path:
|
||||
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
elif 'xlm' in pretrained_model_name_or_path:
|
||||
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
@@ -222,7 +222,7 @@ class BertConfig(PretrainedConfig):
|
||||
|
||||
try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||
except ImportError:
|
||||
except (ImportError, AttributeError) as e:
|
||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||
class BertLayerNorm(nn.Module):
|
||||
def __init__(self, hidden_size, eps=1e-12):
|
||||
@@ -643,12 +643,11 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>> model = BertModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertModel.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -754,13 +753,11 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForPreTraining(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> prediction_scores, seq_relationship_scores = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForPreTraining.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
prediction_scores, seq_relationship_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -824,13 +821,11 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForMaskedLM(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
>>> loss, prediction_scores = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -857,7 +852,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention is they are here
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||
if masked_lm_labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
@@ -891,13 +886,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForNextSentencePrediction(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> seq_relationship_scores = outputs[0]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
seq_relationship_scores = outputs[0]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -951,14 +944,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForSequenceClassification(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=labels)
|
||||
>>> loss, logits = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1057,15 +1048,13 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForMultipleChoice(config)
|
||||
>>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
||||
>>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
>>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=labels)
|
||||
>>> loss, classification_scores = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
|
||||
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
|
||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, classification_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1127,14 +1116,12 @@ class BertForTokenClassification(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForTokenClassification(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=labels)
|
||||
>>> loss, scores = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1203,15 +1190,13 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased')
|
||||
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
>>>
|
||||
>>> model = BertForQuestionAnswering(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> start_positions = torch.tensor([1])
|
||||
>>> end_positions = torch.tensor([3])
|
||||
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
>>> loss, start_scores, end_scores = outputs[:2]
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
start_positions = torch.tensor([1])
|
||||
end_positions = torch.tensor([3])
|
||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
loss, start_scores, end_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -137,7 +137,7 @@ class GPT2Config(PretrainedConfig):
|
||||
initializer_range=0.02,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='token_ids',
|
||||
summary_type='cls_index',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
summary_proj_to_labels=True,
|
||||
@@ -433,12 +433,11 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = GPT2Config.from_pretrained('gpt2')
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2Model(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2Model.from_pretrained('gpt2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -567,12 +566,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = GPT2Config.from_pretrained('gpt2')
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2LMHeadModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=input_ids)
|
||||
>>> loss, logits = outputs[:2]
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -683,14 +681,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = GPT2Config.from_pretrained('gpt2')
|
||||
>>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
>>> model = GPT2DoubleHeadsModel(config)
|
||||
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] # Assume you've added [CLS] to the vocabulary
|
||||
>>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
>>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, mc_token_ids)
|
||||
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
|
||||
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] # Assume you've added [CLS] to the vocabulary
|
||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, mc_token_ids)
|
||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -171,7 +171,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
predict_special_tokens=True,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='token_ids',
|
||||
summary_type='cls_index',
|
||||
summary_use_proj=True,
|
||||
summary_activation=None,
|
||||
summary_proj_to_labels=True,
|
||||
@@ -439,12 +439,11 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
|
||||
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
>>> model = OpenAIGPTModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
model = OpenAIGPTModel.from_pretrained('openai-gpt')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -558,12 +557,11 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
|
||||
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
>>> model = OpenAIGPTLMHeadModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=input_ids)
|
||||
>>> loss, logits = outputs[:2]
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -665,14 +663,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
|
||||
>>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
>>> model = OpenAIGPTDoubleHeadsModel(config)
|
||||
>>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] # Assume you've added [CLS] to the vocabulary
|
||||
>>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
>>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, mc_token_ids)
|
||||
>>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
|
||||
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] # Assume you've added [CLS] to the vocabulary
|
||||
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
|
||||
mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, mc_token_ids)
|
||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -968,12 +968,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
|
||||
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
|
||||
>>> model = TransfoXLModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states, mems = outputs[:2]
|
||||
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
|
||||
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states, mems = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1284,12 +1283,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
|
||||
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
|
||||
>>> model = TransfoXLLMHeadModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> prediction_scores, mems = outputs[:2]
|
||||
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
|
||||
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
prediction_scores, mems = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -68,8 +68,18 @@ else:
|
||||
|
||||
|
||||
class PretrainedConfig(object):
|
||||
""" Base class for all configuration classes.
|
||||
Handle a few common parameters and methods for loading/downloading/saving configurations.
|
||||
r""" Base class for all configuration classes.
|
||||
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
|
||||
|
||||
Class attributes (overridden by derived classes):
|
||||
- ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
|
||||
|
||||
Parameters:
|
||||
``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
|
||||
``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
|
||||
``output_attentions``: boolean, default `False`. Should the model returns attentions weights.
|
||||
``output_hidden_states``: string, default `False`. Should the model returns all hidden-states.
|
||||
``torchscript``: string, default `False`. Is the model used with Torchscript.
|
||||
"""
|
||||
pretrained_config_archive_map = {}
|
||||
|
||||
@@ -81,8 +91,8 @@ class PretrainedConfig(object):
|
||||
self.torchscript = kwargs.pop('torchscript', False)
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save a configuration object to a directory, so that it
|
||||
can be re-loaded using the `from_pretrained(save_directory)` class method.
|
||||
""" Save a configuration object to the directory `save_directory`, so that it
|
||||
can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
|
||||
"""
|
||||
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
||||
|
||||
@@ -93,41 +103,42 @@ class PretrainedConfig(object):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
|
||||
r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
|
||||
r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a saved configuration `file`.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Parameters:
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**return_unused_kwargs**: (`optional`) bool:
|
||||
|
||||
kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
|
||||
|
||||
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
|
||||
|
||||
return_unused_kwargs: (`optional`) bool:
|
||||
|
||||
- If False, then this function returns just the final configuration object.
|
||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
|
||||
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
|
||||
ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key/value pairs with which to update the configuration object after loading.
|
||||
- The values in kwargs of any keys which are configuration attributes will be used
|
||||
to override the loaded values.
|
||||
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the `return_unused_kwargs` keyword parameter.
|
||||
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||
>>> config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||
>>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
|
||||
>>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
||||
>>> assert config.output_attention == True
|
||||
>>> config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
|
||||
>>> foo=False, return_unused_kwargs=True)
|
||||
>>> assert config.output_attention == True
|
||||
>>> assert unused_kwargs == {'foo': False}
|
||||
# We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
|
||||
# derived class: BertConfig
|
||||
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
|
||||
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
|
||||
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
|
||||
config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
|
||||
assert config.output_attention == True
|
||||
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
|
||||
foo=False, return_unused_kwargs=True)
|
||||
assert config.output_attention == True
|
||||
assert unused_kwargs == {'foo': False}
|
||||
|
||||
"""
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
@@ -217,14 +228,26 @@ class PretrainedConfig(object):
|
||||
|
||||
|
||||
class PreTrainedModel(nn.Module):
|
||||
""" Base class for all models. Handle loading/storing model config and
|
||||
a simple interface for dowloading and loading pretrained models.
|
||||
r""" Base class for all models.
|
||||
|
||||
:class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
|
||||
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
|
||||
|
||||
Class attributes (overridden by derived classes):
|
||||
- ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
|
||||
- ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
|
||||
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
|
||||
|
||||
- ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
|
||||
- ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
|
||||
- ``path``: a path (string) to the TensorFlow checkpoint.
|
||||
|
||||
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
|
||||
"""
|
||||
config_class = PretrainedConfig
|
||||
config_class = None
|
||||
pretrained_model_archive_map = {}
|
||||
load_tf_weights = lambda model, config, path: None
|
||||
base_model_prefix = ""
|
||||
input_embeddings = None
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(PreTrainedModel, self).__init__()
|
||||
@@ -282,17 +305,16 @@ class PreTrainedModel(nn.Module):
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens=None):
|
||||
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
|
||||
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
|
||||
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
|
||||
|
||||
Args:
|
||||
new_num_tokens: (`optional`) int
|
||||
New number of tokens in the embedding matrix.
|
||||
Increasing the size will add newly initialized vectors at the end
|
||||
Reducing the size will remove vectors from the end
|
||||
If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
|
||||
Arguments:
|
||||
|
||||
new_num_tokens: (`optional`) int:
|
||||
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
|
||||
If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
|
||||
|
||||
Return: ``torch.nn.Embeddings``
|
||||
Pointer to the input tokens Embedding Module of the model
|
||||
Pointer to the input tokens Embeddings Module of the model
|
||||
"""
|
||||
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||
model_embeds = base_model._resize_token_embeddings(new_num_tokens)
|
||||
@@ -311,15 +333,17 @@ class PreTrainedModel(nn.Module):
|
||||
|
||||
def prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the base model.
|
||||
Args:
|
||||
heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
|
||||
|
||||
Arguments:
|
||||
|
||||
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
|
||||
"""
|
||||
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||
base_model._prune_heads(heads_to_prune)
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save a model with its configuration file to a directory, so that it
|
||||
can be re-loaded using the `from_pretrained(save_directory)` class method.
|
||||
""" Save a model and its configuration file to a directory, so that it
|
||||
can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
|
||||
"""
|
||||
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
|
||||
|
||||
@@ -338,58 +362,53 @@ class PreTrainedModel(nn.Module):
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
|
||||
r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
|
||||
|
||||
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are desactivated)
|
||||
To train the model, you should first set it back in training mode with `model.train()`
|
||||
The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
|
||||
To train the model, you should first set it back in training mode with ``model.train()``
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
|
||||
In this case, ``from_tf`` should be set to True and a configuration object should be
|
||||
provided as `config` argument. This loading option is slower than converting the TensorFlow
|
||||
checkpoint in a PyTorch model using the provided conversion scripts and loading
|
||||
the PyTorch model afterwards.
|
||||
**model_args**: (`optional`) Sequence:
|
||||
All remaning positional arguments will be passed to the underlying model's __init__ function
|
||||
**config**: an optional configuration for the model to use instead of an automatically loaded configuation.
|
||||
Configuration can be automatically loaded when:
|
||||
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
|
||||
- the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
|
||||
**state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
|
||||
from saved weights file.
|
||||
This option can be used if you want to create a model from a pretrained configuraton but load your own weights.
|
||||
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
|
||||
a simpler option.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Parameters:
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
|
||||
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
|
||||
|
||||
model_args: (`optional`) Sequence of positional arguments:
|
||||
All remaning positional arguments will be passed to the underlying model's ``__init__`` method
|
||||
|
||||
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
|
||||
Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
|
||||
|
||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
|
||||
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
|
||||
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
|
||||
|
||||
state_dict: (`optional`) dict:
|
||||
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
|
||||
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
|
||||
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
**output_loading_info**: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
||||
**kwargs**: (`optional`) dict:
|
||||
Dictionary of key, values to update the configuration object after loading.
|
||||
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
|
||||
|
||||
- If a configuration is provided with `config`, **kwargs will be directly passed
|
||||
to the underlying model's __init__ method.
|
||||
- If a configuration is not provided, **kwargs will be first passed to the pretrained
|
||||
model configuration class loading function (`PretrainedConfig.from_pretrained`).
|
||||
Each key of **kwargs that corresponds to a configuration attribute
|
||||
will be used to override said attribute with the supplied **kwargs value.
|
||||
Remaining keys that do not correspond to any configuration attribute will
|
||||
be passed to the underlying model's __init__ function.
|
||||
output_loading_info: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
|
||||
|
||||
kwargs: (`optional`) Remaining dictionary of keyword arguments:
|
||||
Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
|
||||
|
||||
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
|
||||
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
|
||||
|
||||
Examples::
|
||||
|
||||
>>> model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
>>> model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
>>> model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
>>> assert model.config.output_attention == True
|
||||
>>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
>>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
|
||||
>>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
|
||||
model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
|
||||
model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
|
||||
assert model.config.output_attention == True
|
||||
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
|
||||
config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
|
||||
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
|
||||
|
||||
"""
|
||||
config = kwargs.pop('config', None)
|
||||
@@ -760,7 +779,7 @@ class SequenceSummary(nn.Module):
|
||||
- 'last' => [default] take the last token hidden state (like XLNet)
|
||||
- 'first' => take the first token hidden state (like Bert)
|
||||
- 'mean' => take the mean of all tokens hidden states
|
||||
- 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
|
||||
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
|
||||
- 'attn' => Not implemented now, use multi-head attention
|
||||
summary_use_proj: Add a projection after the vector extraction
|
||||
summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
|
||||
@@ -772,7 +791,7 @@ class SequenceSummary(nn.Module):
|
||||
super(SequenceSummary, self).__init__()
|
||||
|
||||
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
|
||||
if config.summary_type == 'attn':
|
||||
if self.summary_type == 'attn':
|
||||
# We should use a standard multi-head attention module with absolute positional embedding for that.
|
||||
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
|
||||
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
|
||||
@@ -798,11 +817,11 @@ class SequenceSummary(nn.Module):
|
||||
if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
|
||||
self.last_dropout = nn.Dropout(config.summary_last_dropout)
|
||||
|
||||
def forward(self, hidden_states, token_ids=None):
|
||||
def forward(self, hidden_states, cls_index=None):
|
||||
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
|
||||
token_ids: [optional] index of the classification token if summary_type == 'token_ids',
|
||||
cls_index: [optional] position of the classification token if summary_type == 'cls_index',
|
||||
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
|
||||
if summary_type == 'token_ids' and token_ids is None:
|
||||
if summary_type == 'cls_index' and cls_index is None:
|
||||
we take the last token of the sequence as classification token
|
||||
"""
|
||||
if self.summary_type == 'last':
|
||||
@@ -811,14 +830,14 @@ class SequenceSummary(nn.Module):
|
||||
output = hidden_states[:, 0]
|
||||
elif self.summary_type == 'mean':
|
||||
output = hidden_states.mean(dim=1)
|
||||
elif self.summary_type == 'token_ids':
|
||||
if token_ids is None:
|
||||
token_ids = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
|
||||
elif self.summary_type == 'cls_index':
|
||||
if cls_index is None:
|
||||
cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
|
||||
else:
|
||||
token_ids = token_ids.unsqueeze(-1).unsqueeze(-1)
|
||||
token_ids = token_ids.expand((-1,) * (token_ids.dim()-1) + (hidden_states.size(-1),))
|
||||
# shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
|
||||
output = hidden_states.gather(-2, token_ids).squeeze(-2) # shape (bsz, XX, hidden_size)
|
||||
cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
|
||||
cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
|
||||
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
|
||||
output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
|
||||
elif self.summary_type == 'attn':
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -472,12 +472,11 @@ class XLMModel(XLMPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> model = XLMModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
|
||||
@@ -745,12 +744,11 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> model = XLMWithLMHeadModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -805,14 +803,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>>
|
||||
>>> model = XLMForSequenceClassification(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=labels)
|
||||
>>> loss, logits = outputs[:2]
|
||||
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -885,15 +881,13 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>>
|
||||
>>> model = XLMForQuestionAnswering(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> start_positions = torch.tensor([1])
|
||||
>>> end_positions = torch.tensor([3])
|
||||
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
>>> loss, start_scores, end_scores = outputs[:2]
|
||||
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
start_positions = torch.tensor([1])
|
||||
end_positions = torch.tensor([3])
|
||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
loss, start_scores, end_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
@@ -335,7 +335,7 @@ class XLNetConfig(PretrainedConfig):
|
||||
|
||||
try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
|
||||
except ImportError:
|
||||
except (ImportError, AttributeError) as e:
|
||||
logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
|
||||
class XLNetLayerNorm(nn.Module):
|
||||
def __init__(self, d_model, eps=1e-12):
|
||||
@@ -712,12 +712,11 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
|
||||
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
>>> model = XLNetModel(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids)
|
||||
>>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetModel.from_pretrained('xlnet-large-cased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids)
|
||||
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1019,17 +1018,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
|
||||
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
>>> model = XLNetLMHeadModel(config)
|
||||
>>> # We show how to setup inputs to predict a next token using a bi-directional context.
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0) # We will predict the masked token
|
||||
>>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
|
||||
>>> perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
|
||||
>>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
|
||||
>>> target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
|
||||
>>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
|
||||
>>> next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
|
||||
# We show how to setup inputs to predict a next token using a bi-directional context.
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0) # We will predict the masked token
|
||||
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
|
||||
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token
|
||||
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token
|
||||
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
|
||||
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
|
||||
next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1100,14 +1098,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
|
||||
>>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
>>>
|
||||
>>> model = XLNetForSequenceClassification(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
>>> outputs = model(input_ids, labels=labels)
|
||||
>>> loss, logits = outputs[:2]
|
||||
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
|
||||
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, labels=labels)
|
||||
loss, logits = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
@@ -1200,15 +1196,13 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
||||
|
||||
Examples::
|
||||
|
||||
>>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
|
||||
>>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
>>>
|
||||
>>> model = XLMForQuestionAnswering(config)
|
||||
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
>>> start_positions = torch.tensor([1])
|
||||
>>> end_positions = torch.tensor([3])
|
||||
>>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
>>> loss, start_scores, end_scores = outputs[:2]
|
||||
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
|
||||
model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
|
||||
start_positions = torch.tensor([1])
|
||||
end_positions = torch.tensor([3])
|
||||
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
|
||||
loss, start_scores, end_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
def __init__(self, config):
|
||||
|
||||
47
pytorch_transformers/tests/modeling_auto_test.py
Normal file
47
pytorch_transformers/tests/modeling_auto_test.py
Normal file
@@ -0,0 +1,47 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
|
||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
|
||||
|
||||
|
||||
class AutoModelTest(unittest.TestCase):
|
||||
def test_model_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
config = AutoConfig.from_pretrained(model_name)
|
||||
self.assertIsNotNone(config)
|
||||
self.assertIsInstance(config, BertConfig)
|
||||
|
||||
model = AutoModel.from_pretrained(model_name)
|
||||
model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
|
||||
self.assertIsNotNone(model)
|
||||
self.assertIsInstance(model, BertModel)
|
||||
for value in loading_info.values():
|
||||
self.assertEqual(len(value), 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
46
pytorch_transformers/tests/tokenization_auto_test.py
Normal file
46
pytorch_transformers/tests/tokenization_auto_test.py
Normal file
@@ -0,0 +1,46 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import shutil
|
||||
import pytest
|
||||
import logging
|
||||
|
||||
from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
|
||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
|
||||
class AutoTokenizerTest(unittest.TestCase):
|
||||
def test_tokenizer_from_pretrained(self):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
self.assertIsNotNone(tokenizer)
|
||||
self.assertIsInstance(tokenizer, BertTokenizer)
|
||||
self.assertGreater(len(tokenizer), 0)
|
||||
|
||||
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
self.assertIsNotNone(tokenizer)
|
||||
self.assertIsInstance(tokenizer, GPT2Tokenizer)
|
||||
self.assertGreater(len(tokenizer), 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -24,30 +24,37 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
|
||||
_is_control, _is_punctuation,
|
||||
_is_whitespace, VOCAB_FILES_NAMES)
|
||||
|
||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
class TokenizationTest(unittest.TestCase):
|
||||
class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
tokenizer_class = BertTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(BertTokenizationTest, self).setUp()
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
vocab_tokens = [
|
||||
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
|
||||
"##ing", ",", "low", "lowest",
|
||||
]
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
input_text = u"UNwant\u00E9d,running"
|
||||
output_text = u"unwanted, running"
|
||||
def get_tokenizer(self):
|
||||
return BertTokenizer.from_pretrained(self.tmpdirname)
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, BertTokenizer, tmpdirname)
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"UNwant\u00E9d,running"
|
||||
output_text = u"unwanted, running"
|
||||
return input_text, output_text
|
||||
|
||||
tokenizer = BertTokenizer(vocab_file)
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
|
||||
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
|
||||
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
|
||||
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
|
||||
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
|
||||
|
||||
def test_chinese(self):
|
||||
tokenizer = BasicTokenizer()
|
||||
|
||||
@@ -20,42 +20,49 @@ import json
|
||||
|
||||
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
class GPT2TokenizationTest(unittest.TestCase):
|
||||
class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||
tokenizer_class = GPT2Tokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(GPT2TokenizationTest, self).setUp()
|
||||
|
||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||
"lo", "low", "er",
|
||||
"low", "lowest", "newer", "wider", "<unk>"]
|
||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||
merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
|
||||
special_tokens_map = {"unk_token": "<unk>"}
|
||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(self.vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(self.merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower<unk>newer"
|
||||
def get_tokenizer(self):
|
||||
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, GPT2Tokenizer, tmpdirname, **special_tokens_map)
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower<unk>newer"
|
||||
return input_text, output_text
|
||||
|
||||
tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + [tokenizer.unk_token]
|
||||
input_bpe_tokens = [13, 12, 17]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
input_tokens = tokens + [tokenizer.unk_token]
|
||||
input_bpe_tokens = [13, 12, 17]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -20,13 +20,17 @@ import json
|
||||
|
||||
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
|
||||
class OpenAIGPTTokenizationTest(unittest.TestCase):
|
||||
class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||
tokenizer_class = OpenAIGPTTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(OpenAIGPTTokenizationTest, self).setUp()
|
||||
|
||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||
"w</w>", "r</w>", "t</w>",
|
||||
"lo", "low", "er</w>",
|
||||
@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
|
||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(self.vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(self.merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower newer"
|
||||
def get_tokenizer(self):
|
||||
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, OpenAIGPTTokenizer, tmpdirname)
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower newer"
|
||||
return input_text, output_text
|
||||
|
||||
tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)
|
||||
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er</w>"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
|
||||
|
||||
input_tokens = tokens + ["<unk>"]
|
||||
input_bpe_tokens = [14, 15, 20]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er</w>"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + ["<unk>"]
|
||||
input_bpe_tokens = [14, 15, 20]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -19,6 +19,7 @@ import sys
|
||||
from io import open
|
||||
import tempfile
|
||||
import shutil
|
||||
import unittest
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
@@ -36,113 +37,124 @@ else:
|
||||
unicode = str
|
||||
|
||||
|
||||
def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
|
||||
tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
|
||||
class CommonTestCases:
|
||||
|
||||
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
class CommonTokenizerTester(unittest.TestCase):
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
tokenizer.save_pretrained(tmpdirname)
|
||||
tokenizer = tokenizer.from_pretrained(tmpdirname)
|
||||
tokenizer_class = None
|
||||
|
||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
tester.assertListEqual(before_tokens, after_tokens)
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
|
||||
tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
|
||||
tester.assertIsNotNone(tokenizer)
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
|
||||
text = u"Munich and Berlin are nice cities"
|
||||
subwords = tokenizer.tokenize(text)
|
||||
def get_tokenizer(self):
|
||||
raise NotImplementedError
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
def get_input_output_texts(self):
|
||||
raise NotImplementedError
|
||||
|
||||
filename = os.path.join(tmpdirname, u"tokenizer.bin")
|
||||
pickle.dump(tokenizer, open(filename, "wb"))
|
||||
def test_save_and_load_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
tokenizer_new = pickle.load(open(filename, "rb"))
|
||||
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
|
||||
subwords_loaded = tokenizer_new.tokenize(text)
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
tokenizer.save_pretrained(tmpdirname)
|
||||
tokenizer = tokenizer.from_pretrained(tmpdirname)
|
||||
|
||||
tester.assertListEqual(subwords, subwords_loaded)
|
||||
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
|
||||
self.assertListEqual(before_tokens, after_tokens)
|
||||
|
||||
def test_pickle_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
self.assertIsNotNone(tokenizer)
|
||||
|
||||
text = u"Munich and Berlin are nice cities"
|
||||
subwords = tokenizer.tokenize(text)
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
|
||||
filename = os.path.join(tmpdirname, u"tokenizer.bin")
|
||||
pickle.dump(tokenizer, open(filename, "wb"))
|
||||
|
||||
tokenizer_new = pickle.load(open(filename, "rb"))
|
||||
|
||||
subwords_loaded = tokenizer_new.tokenize(text)
|
||||
|
||||
self.assertListEqual(subwords, subwords_loaded)
|
||||
|
||||
|
||||
def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
|
||||
tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
|
||||
def test_add_tokens_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
vocab_size = tokenizer.vocab_size
|
||||
all_size = len(tokenizer)
|
||||
vocab_size = tokenizer.vocab_size
|
||||
all_size = len(tokenizer)
|
||||
|
||||
tester.assertNotEqual(vocab_size, 0)
|
||||
tester.assertEqual(vocab_size, all_size)
|
||||
self.assertNotEqual(vocab_size, 0)
|
||||
self.assertEqual(vocab_size, all_size)
|
||||
|
||||
new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
|
||||
added_toks = tokenizer.add_tokens(new_toks)
|
||||
vocab_size_2 = tokenizer.vocab_size
|
||||
all_size_2 = len(tokenizer)
|
||||
new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
|
||||
added_toks = tokenizer.add_tokens(new_toks)
|
||||
vocab_size_2 = tokenizer.vocab_size
|
||||
all_size_2 = len(tokenizer)
|
||||
|
||||
tester.assertNotEqual(vocab_size_2, 0)
|
||||
tester.assertEqual(vocab_size, vocab_size_2)
|
||||
tester.assertEqual(added_toks, len(new_toks))
|
||||
tester.assertEqual(all_size_2, all_size + len(new_toks))
|
||||
self.assertNotEqual(vocab_size_2, 0)
|
||||
self.assertEqual(vocab_size, vocab_size_2)
|
||||
self.assertEqual(added_toks, len(new_toks))
|
||||
self.assertEqual(all_size_2, all_size + len(new_toks))
|
||||
|
||||
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
|
||||
tester.assertGreaterEqual(len(tokens), 4)
|
||||
tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
|
||||
self.assertGreaterEqual(len(tokens), 4)
|
||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
|
||||
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
|
||||
'pad_token': "<<<<<|||>|>>>>|>"}
|
||||
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
||||
vocab_size_3 = tokenizer.vocab_size
|
||||
all_size_3 = len(tokenizer)
|
||||
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
|
||||
'pad_token': "<<<<<|||>|>>>>|>"}
|
||||
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
|
||||
vocab_size_3 = tokenizer.vocab_size
|
||||
all_size_3 = len(tokenizer)
|
||||
|
||||
tester.assertNotEqual(vocab_size_3, 0)
|
||||
tester.assertEqual(vocab_size, vocab_size_3)
|
||||
tester.assertEqual(added_toks_2, len(new_toks_2))
|
||||
tester.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||
self.assertNotEqual(vocab_size_3, 0)
|
||||
self.assertEqual(vocab_size, vocab_size_3)
|
||||
self.assertEqual(added_toks_2, len(new_toks_2))
|
||||
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
|
||||
|
||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
||||
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
|
||||
|
||||
tester.assertGreaterEqual(len(tokens), 6)
|
||||
tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
tester.assertGreater(tokens[0], tokens[1])
|
||||
tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
tester.assertGreater(tokens[-2], tokens[-3])
|
||||
tester.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
|
||||
tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
|
||||
self.assertGreaterEqual(len(tokens), 6)
|
||||
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[0], tokens[1])
|
||||
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
|
||||
self.assertGreater(tokens[-2], tokens[-3])
|
||||
self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
|
||||
self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
|
||||
|
||||
|
||||
def create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
|
||||
tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
|
||||
def test_required_methods_tokenizer(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
input_text, output_text = self.get_input_output_texts()
|
||||
|
||||
tokens = tokenizer.tokenize(input_text)
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
ids_2 = tokenizer.encode(input_text)
|
||||
tester.assertListEqual(ids, ids_2)
|
||||
tokens = tokenizer.tokenize(input_text)
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
ids_2 = tokenizer.encode(input_text)
|
||||
self.assertListEqual(ids, ids_2)
|
||||
|
||||
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
|
||||
text_2 = tokenizer.decode(ids)
|
||||
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
|
||||
text_2 = tokenizer.decode(ids)
|
||||
|
||||
tester.assertEqual(text_2, output_text)
|
||||
self.assertEqual(text_2, output_text)
|
||||
|
||||
tester.assertNotEqual(len(tokens_2), 0)
|
||||
tester.assertIsInstance(text_2, (str, unicode))
|
||||
self.assertNotEqual(len(tokens_2), 0)
|
||||
self.assertIsInstance(text_2, (str, unicode))
|
||||
|
||||
|
||||
def create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
|
||||
weights_list = list(tokenizer_class.max_model_input_sizes.keys())
|
||||
weights_lists_2 = []
|
||||
for file_id, map_list in tokenizer_class.pretrained_vocab_files_map.items():
|
||||
weights_lists_2.append(list(map_list.keys()))
|
||||
def test_pretrained_model_lists(self):
|
||||
weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
|
||||
weights_lists_2 = []
|
||||
for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
|
||||
weights_lists_2.append(list(map_list.keys()))
|
||||
|
||||
for weights_list_2 in weights_lists_2:
|
||||
tester.assertListEqual(weights_list, weights_list_2)
|
||||
|
||||
|
||||
def create_and_check_tokenizer_commons(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
|
||||
create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
|
||||
create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
|
||||
create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
|
||||
create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
|
||||
create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
|
||||
for weights_list_2 in weights_lists_2:
|
||||
self.assertListEqual(weights_list, weights_list_2)
|
||||
|
||||
@@ -20,32 +20,39 @@ from io import open
|
||||
|
||||
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
from.tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from.tokenization_tests_commons import CommonTestCases
|
||||
|
||||
class TransfoXLTokenizationTest(unittest.TestCase):
|
||||
class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
tokenizer_class = TransfoXLTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(TransfoXLTokenizationTest, self).setUp()
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
vocab_tokens = [
|
||||
"<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
|
||||
"running", ",", "low", "l",
|
||||
]
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
|
||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||
|
||||
input_text = u"<unk> UNwanted , running"
|
||||
output_text = u"<unk> unwanted, running"
|
||||
def get_tokenizer(self):
|
||||
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, TransfoXLTokenizer, tmpdirname, lower_case=True)
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"<unk> UNwanted , running"
|
||||
output_text = u"<unk> unwanted, running"
|
||||
return input_text, output_text
|
||||
|
||||
tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
|
||||
|
||||
tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
|
||||
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
|
||||
tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
|
||||
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
|
||||
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
|
||||
|
||||
def test_full_tokenizer_lower(self):
|
||||
tokenizer = TransfoXLTokenizer(lower_case=True)
|
||||
|
||||
@@ -20,12 +20,16 @@ import json
|
||||
|
||||
from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
|
||||
|
||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
class XLMTokenizationTest(unittest.TestCase):
|
||||
class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||
tokenizer_class = XLMTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(XLMTokenizationTest, self).setUp()
|
||||
|
||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
|
||||
"w</w>", "r</w>", "t</w>",
|
||||
"lo", "low", "er</w>",
|
||||
@@ -33,30 +37,34 @@ class XLMTokenizationTest(unittest.TestCase):
|
||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
|
||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
|
||||
with open(self.vocab_file, "w") as fp:
|
||||
fp.write(json.dumps(vocab_tokens))
|
||||
with open(self.merges_file, "w") as fp:
|
||||
fp.write("\n".join(merges))
|
||||
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower newer"
|
||||
def get_tokenizer(self):
|
||||
return XLMTokenizer.from_pretrained(self.tmpdirname)
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, XLMTokenizer, tmpdirname)
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"lower newer"
|
||||
output_text = u"lower newer"
|
||||
return input_text, output_text
|
||||
|
||||
tokenizer = XLMTokenizer(vocab_file, merges_file)
|
||||
def test_full_tokenizer(self):
|
||||
""" Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
|
||||
tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
|
||||
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er</w>"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
text = "lower"
|
||||
bpe_tokens = ["low", "er</w>"]
|
||||
tokens = tokenizer.tokenize(text)
|
||||
self.assertListEqual(tokens, bpe_tokens)
|
||||
|
||||
input_tokens = tokens + ["<unk>"]
|
||||
input_bpe_tokens = [14, 15, 20]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
input_tokens = tokens + ["<unk>"]
|
||||
input_bpe_tokens = [14, 15, 20]
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -19,48 +19,58 @@ import unittest
|
||||
|
||||
from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
|
||||
|
||||
from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
|
||||
from .tokenization_tests_commons import CommonTestCases
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
'fixtures/test_sentencepiece.model')
|
||||
|
||||
class XLNetTokenizationTest(unittest.TestCase):
|
||||
class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
||||
|
||||
tokenizer_class = XLNetTokenizer
|
||||
|
||||
def setUp(self):
|
||||
super(XLNetTokenizationTest, self).setUp()
|
||||
|
||||
# We have a SentencePiece fixture for testing
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
def get_tokenizer(self):
|
||||
return XLNetTokenizer.from_pretrained(self.tmpdirname)
|
||||
|
||||
def get_input_output_texts(self):
|
||||
input_text = u"This is a test"
|
||||
output_text = u"This is a test"
|
||||
return input_text, output_text
|
||||
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||
|
||||
with TemporaryDirectory() as tmpdirname:
|
||||
tokenizer.save_pretrained(tmpdirname)
|
||||
tokens = tokenizer.tokenize(u'This is a test')
|
||||
self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
|
||||
|
||||
input_text = u"This is a test"
|
||||
output_text = u"This is a test"
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
|
||||
|
||||
create_and_check_tokenizer_commons(self, input_text, output_text, XLNetTokenizer, tmpdirname)
|
||||
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
|
||||
u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
|
||||
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
|
||||
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
self.assertListEqual(
|
||||
ids, [8, 21, 84, 55, 24, 19, 7, 0,
|
||||
602, 347, 347, 347, 3, 12, 66,
|
||||
46, 72, 80, 6, 0, 4])
|
||||
|
||||
tokens = tokenizer.tokenize(u'This is a test')
|
||||
self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
|
||||
|
||||
self.assertListEqual(
|
||||
tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
|
||||
|
||||
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
|
||||
u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
|
||||
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
|
||||
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
self.assertListEqual(
|
||||
ids, [8, 21, 84, 55, 24, 19, 7, 0,
|
||||
602, 347, 347, 347, 3, 12, 66,
|
||||
46, 72, 80, 6, 0, 4])
|
||||
|
||||
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
|
||||
u'or', u'n', SPIECE_UNDERLINE + u'in',
|
||||
SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
|
||||
SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
|
||||
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
|
||||
u'<unk>', u'.'])
|
||||
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
|
||||
u'or', u'n', SPIECE_UNDERLINE + u'in',
|
||||
SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
|
||||
SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
|
||||
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
|
||||
u'<unk>', u'.'])
|
||||
|
||||
def test_tokenizer_lower(self):
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
|
||||
|
||||
100
pytorch_transformers/tokenization_auto.py
Normal file
100
pytorch_transformers/tokenization_auto.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from .tokenization_bert import BertTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
from .tokenization_transfo_xl import TransfoXLTokenizer
|
||||
from .tokenization_xlnet import XLNetTokenizer
|
||||
from .tokenization_xlm import XLMTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class AutoTokenizer(object):
|
||||
r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
|
||||
that will be instantiated as one of the tokenizer classes of the library
|
||||
when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
|
||||
class method.
|
||||
|
||||
The `from_pretrained()` method take care of returning the correct tokenizer class instance
|
||||
using pattern matching on the `pretrained_model_name_or_path` string.
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||
- contains `xlm`: XLMTokenizer (XLM model)
|
||||
|
||||
This class cannot be instantiated using `__init__()` (throw an error).
|
||||
"""
|
||||
def __init__(self):
|
||||
raise EnvironmentError("AutoTokenizer is designed to be instantiated "
|
||||
"using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
r""" Instantiate a one of the tokenizer classes of the library
|
||||
from a pre-trained model vocabulary.
|
||||
|
||||
The tokenizer class to instantiate is selected as the first pattern matching
|
||||
in the `pretrained_model_name_or_path` string (in the following order):
|
||||
- contains `bert`: BertTokenizer (Bert model)
|
||||
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
|
||||
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
|
||||
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
|
||||
- contains `xlnet`: XLNetTokenizer (XLNet model)
|
||||
- contains `xlm`: XLMTokenizer (XLM model)
|
||||
|
||||
Params:
|
||||
**pretrained_model_name_or_path**: either:
|
||||
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
|
||||
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
|
||||
- a path to a `directory` containing a configuration file saved
|
||||
using the `save_pretrained(save_directory)` method.
|
||||
- a path or url to a saved configuration `file`.
|
||||
**cache_dir**: (`optional`) string:
|
||||
Path to a directory in which a downloaded pre-trained model
|
||||
configuration should be cached if the standard cache should not be used.
|
||||
|
||||
Examples::
|
||||
|
||||
config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache.
|
||||
config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
|
||||
|
||||
"""
|
||||
if 'bert' in pretrained_model_name_or_path:
|
||||
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'openai-gpt' in pretrained_model_name_or_path:
|
||||
return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'gpt2' in pretrained_model_name_or_path:
|
||||
return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'transfo-xl' in pretrained_model_name_or_path:
|
||||
return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'xlnet' in pretrained_model_name_or_path:
|
||||
return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
elif 'xlm' in pretrained_model_name_or_path:
|
||||
return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
|
||||
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
|
||||
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
|
||||
"'xlm'".format(pretrained_model_name_or_path))
|
||||
@@ -22,7 +22,7 @@ import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -86,7 +86,7 @@ def whitespace_tokenize(text):
|
||||
class BertTokenizer(PreTrainedTokenizer):
|
||||
r"""
|
||||
Constructs a BertTokenizer.
|
||||
:class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
|
||||
:class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
|
||||
|
||||
Args:
|
||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
||||
@@ -119,7 +119,7 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
Only has an effect when do_basic_tokenize=True
|
||||
**tokenize_chinese_chars**: (`optional`) boolean (default True)
|
||||
Whether to tokenize Chinese characters.
|
||||
This should likely be desactivated for Japanese:
|
||||
This should likely be deactivated for Japanese:
|
||||
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
||||
"""
|
||||
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
|
||||
@@ -214,7 +214,7 @@ class BasicTokenizer(object):
|
||||
List of token not to split.
|
||||
**tokenize_chinese_chars**: (`optional`) boolean (default True)
|
||||
Whether to tokenize Chinese characters.
|
||||
This should likely be desactivated for Japanese:
|
||||
This should likely be deactivated for Japanese:
|
||||
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
||||
"""
|
||||
if never_split is None:
|
||||
|
||||
@@ -31,7 +31,7 @@ except ImportError:
|
||||
def lru_cache():
|
||||
return lambda func: func
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -102,9 +102,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
def __init__(self, vocab_file, merges_file, errors='replace',
|
||||
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
|
||||
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
|
||||
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
|
||||
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
||||
|
||||
self.encoder = json.load(open(vocab_file))
|
||||
self.decoder = {v:k for k,v in self.encoder.items()}
|
||||
@@ -177,9 +177,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
if token in self.encoder:
|
||||
return self.encoder.get(token)
|
||||
return self.encoder.get(self.unk_token)
|
||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
|
||||
@@ -30,7 +30,7 @@ import torch
|
||||
import numpy as np
|
||||
|
||||
from .file_utils import cached_path
|
||||
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
|
||||
@@ -30,14 +30,34 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
|
||||
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||
|
||||
class PreTrainedTokenizer(object):
|
||||
""" An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
|
||||
""" Base class for all tokenizers.
|
||||
Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary.
|
||||
|
||||
Derived class can set up a few special tokens to be used in common scripts and internals:
|
||||
bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
|
||||
additional_special_tokens = []
|
||||
This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
|
||||
|
||||
We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
|
||||
specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
|
||||
Class attributes (overridden by derived classes):
|
||||
|
||||
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
|
||||
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
|
||||
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
|
||||
|
||||
Parameters:
|
||||
|
||||
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
|
||||
|
||||
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
|
||||
|
||||
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
|
||||
|
||||
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
|
||||
|
||||
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
|
||||
|
||||
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
|
||||
|
||||
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
|
||||
|
||||
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
|
||||
"""
|
||||
vocab_files_names = {}
|
||||
pretrained_vocab_files_map = {}
|
||||
@@ -49,48 +69,56 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
@property
|
||||
def bos_token(self):
|
||||
""" Beginning of sentence token (string). Log an error if used while not having been set. """
|
||||
if self._bos_token is None:
|
||||
logger.error("Using bos_token, but it is not set yet.")
|
||||
return self._bos_token
|
||||
|
||||
@property
|
||||
def eos_token(self):
|
||||
""" End of sentence token (string). Log an error if used while not having been set. """
|
||||
if self._eos_token is None:
|
||||
logger.error("Using eos_token, but it is not set yet.")
|
||||
return self._eos_token
|
||||
|
||||
@property
|
||||
def unk_token(self):
|
||||
""" Unknown token (string). Log an error if used while not having been set. """
|
||||
if self._unk_token is None:
|
||||
logger.error("Using unk_token, but it is not set yet.")
|
||||
return self._unk_token
|
||||
|
||||
@property
|
||||
def sep_token(self):
|
||||
""" Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
|
||||
if self._sep_token is None:
|
||||
logger.error("Using sep_token, but it is not set yet.")
|
||||
return self._sep_token
|
||||
|
||||
@property
|
||||
def pad_token(self):
|
||||
""" Padding token (string). Log an error if used while not having been set. """
|
||||
if self._pad_token is None:
|
||||
logger.error("Using pad_token, but it is not set yet.")
|
||||
return self._pad_token
|
||||
|
||||
@property
|
||||
def cls_token(self):
|
||||
""" Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
|
||||
if self._cls_token is None:
|
||||
logger.error("Using cls_token, but it is not set yet.")
|
||||
return self._cls_token
|
||||
|
||||
@property
|
||||
def mask_token(self):
|
||||
""" Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
|
||||
if self._mask_token is None:
|
||||
logger.error("Using mask_token, but it is not set yet.")
|
||||
return self._mask_token
|
||||
|
||||
@property
|
||||
def additional_special_tokens(self):
|
||||
""" All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
|
||||
if self._additional_special_tokens is None:
|
||||
logger.error("Using additional_special_tokens, but it is not set yet.")
|
||||
return self._additional_special_tokens
|
||||
@@ -143,20 +171,58 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
|
||||
if key == 'additional_special_tokens':
|
||||
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
|
||||
else:
|
||||
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *inputs, **kwargs):
|
||||
r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
|
||||
|
||||
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
|
||||
|
||||
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
|
||||
|
||||
Examples::
|
||||
|
||||
# We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
|
||||
|
||||
# Download vocabulary from S3 and cache.
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
|
||||
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
|
||||
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
|
||||
|
||||
# If the tokenizer uses a single vocabulary file, you can point directly to this file
|
||||
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
|
||||
|
||||
# You can link tokens to special vocabulary when instantiating
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
|
||||
# You should be sure '<unk>' is in the vocabulary when doing that.
|
||||
# Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
|
||||
assert tokenizer.unk_token == '<unk>'
|
||||
|
||||
"""
|
||||
return cls._from_pretrained(*inputs, **kwargs)
|
||||
|
||||
|
||||
@classmethod
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
|
||||
Download and cache the vocabulary files if needed.
|
||||
"""
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
vocab_files = {}
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
@@ -271,8 +337,9 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save the tokenizer vocabulary files (with added tokens) and the
|
||||
special-tokens-to-class-attributes-mapping to a directory, so that it
|
||||
can be re-loaded using the `from_pretrained(save_directory)` class method.
|
||||
special-tokens-to-class-attributes-mapping to a directory.
|
||||
|
||||
This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||
"""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Saving directory ({}) should be a directory".format(save_directory))
|
||||
@@ -297,38 +364,52 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
""" Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
|
||||
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
|
||||
and special token mappings.
|
||||
|
||||
Please use `save_pretrained()` to save the full Tokenizer state so that it can be
|
||||
reloaded using the `from_pretrained(save_directory)` class method.
|
||||
|
||||
Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def vocab_size(self):
|
||||
""" Size of the base vocabulary (without the added tokens) """
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def __len__(self):
|
||||
""" Size of the full vocabulary with the added tokens """
|
||||
return self.vocab_size + len(self.added_tokens_encoder)
|
||||
|
||||
|
||||
def add_tokens(self, new_tokens):
|
||||
""" Add a list of new tokens to the tokenizer class. If the new tokens are not in the
|
||||
vocabulary, they are added to the added_tokens_encoder with indices starting from
|
||||
the last index of the current vocabulary.
|
||||
vocabulary, they are added to it with indices starting from length of the current vocabulary.
|
||||
|
||||
Parameters:
|
||||
new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
|
||||
|
||||
Returns:
|
||||
Number of tokens added to the vocabulary which can be used to correspondingly
|
||||
increase the size of the associated model embedding matrices.
|
||||
Number of tokens added to the vocabulary.
|
||||
|
||||
Examples::
|
||||
|
||||
# Let's see how to increase the vocabulary of Bert model and tokenizer
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertModel.from_pretrained('bert-base-uncased')
|
||||
|
||||
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
|
||||
print('We have added', num_added_toks, 'tokens')
|
||||
model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
|
||||
"""
|
||||
if not new_tokens:
|
||||
return 0
|
||||
|
||||
to_add_tokens = []
|
||||
for token in new_tokens:
|
||||
if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
|
||||
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
|
||||
if token != self.unk_token and \
|
||||
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
|
||||
to_add_tokens.append(token)
|
||||
logger.info("Adding %s to the vocabulary", token)
|
||||
|
||||
@@ -341,24 +422,48 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
|
||||
def add_special_tokens(self, special_tokens_dict):
|
||||
""" Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
|
||||
to class attributes. If the special tokens are not in the vocabulary, they are added
|
||||
to it and indexed starting from the last index of the current vocabulary.
|
||||
""" Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
|
||||
to class attributes. If special tokens are NOT in the vocabulary, they are added
|
||||
to it (indexed starting from the last index of the current vocabulary).
|
||||
|
||||
Parameters:
|
||||
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
|
||||
|
||||
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
|
||||
|
||||
Returns:
|
||||
Number of tokens added to the vocabulary which can be used to correspondingly
|
||||
increase the size of the associated model embedding matrices.
|
||||
Number of tokens added to the vocabulary.
|
||||
|
||||
Examples::
|
||||
|
||||
# Let's see how to add a new classification token to GPT-2
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2Model.from_pretrained('gpt2')
|
||||
|
||||
special_tokens_dict = {'cls_token': '<CLS>'}
|
||||
|
||||
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
|
||||
print('We have added', num_added_toks, 'tokens')
|
||||
model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
|
||||
|
||||
assert tokenizer.cls_token == '<CLS>'
|
||||
"""
|
||||
if not special_tokens_dict:
|
||||
return 0
|
||||
|
||||
added_special_tokens = self.add_tokens(special_tokens_dict.values())
|
||||
added_tokens = 0
|
||||
for key, value in special_tokens_dict.items():
|
||||
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
|
||||
if key == 'additional_special_tokens':
|
||||
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
|
||||
added_tokens += self.add_tokens(value)
|
||||
else:
|
||||
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
|
||||
added_tokens += self.add_tokens([value])
|
||||
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
|
||||
setattr(self, key, value)
|
||||
|
||||
return added_special_tokens
|
||||
|
||||
return added_tokens
|
||||
|
||||
def tokenize(self, text, **kwargs):
|
||||
""" Converts a string in a sequence of tokens (string), using the tokenizer.
|
||||
@@ -386,13 +491,13 @@ class PreTrainedTokenizer(object):
|
||||
Split in words for word-based vocabulary or sub-words for sub-word-based
|
||||
vocabularies (BPE/SentencePieces/WordPieces).
|
||||
|
||||
Don't take care of added tokens.
|
||||
Do NOT take care of added tokens.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
""" Converts a single token or a sequence of tokens (str/unicode) in a integer id
|
||||
(resp.) a sequence of ids, using the vocabulary.
|
||||
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
|
||||
(resp. a sequence of ids), using the vocabulary.
|
||||
"""
|
||||
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
|
||||
return self._convert_token_to_id_with_added_voc(tokens)
|
||||
@@ -417,7 +522,8 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
def encode(self, text):
|
||||
""" Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
||||
same as self.convert_tokens_to_ids(self.tokenize(text)).
|
||||
|
||||
Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
|
||||
"""
|
||||
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||
|
||||
@@ -457,11 +563,13 @@ class PreTrainedTokenizer(object):
|
||||
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
|
||||
""" Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
|
||||
with options to remove special tokens and clean up tokenization spaces.
|
||||
|
||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||
"""
|
||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
text = self.convert_tokens_to_string(filtered_tokens)
|
||||
if clean_up_tokenization_spaces:
|
||||
text = clean_up_tokenization(text)
|
||||
text = self.clean_up_tokenization(text)
|
||||
return text
|
||||
|
||||
@property
|
||||
@@ -497,10 +605,11 @@ class PreTrainedTokenizer(object):
|
||||
all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
|
||||
return all_ids
|
||||
|
||||
|
||||
|
||||
def clean_up_tokenization(out_string):
|
||||
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
|
||||
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
|
||||
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
|
||||
return out_string
|
||||
@staticmethod
|
||||
def clean_up_tokenization(out_string):
|
||||
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
|
||||
"""
|
||||
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
|
||||
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
|
||||
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
|
||||
return out_string
|
||||
|
||||
@@ -23,7 +23,7 @@ from shutil import copyfile
|
||||
import unicodedata
|
||||
import six
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user