updating docs - adding few tests to tokenizers
This commit is contained in:
@@ -30,14 +30,34 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
|
||||
ADDED_TOKENS_FILE = 'added_tokens.json'
|
||||
|
||||
class PreTrainedTokenizer(object):
|
||||
""" An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
|
||||
""" Base class for all tokenizers.
|
||||
Handles all the shared methods for tokenization and special tokens, as well as the methods for downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.
|
||||
|
||||
Derived classes can set up a few special tokens to be used in common scripts and internals:
|
||||
bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
|
||||
additional_special_tokens = []
|
||||
This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
|
||||
|
||||
We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
|
||||
specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
|
||||
Class attributes (overridden by derived classes):
|
||||
|
||||
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
|
||||
- ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level keys being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
|
||||
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
|
||||
|
||||
Parameters:
|
||||
|
||||
- ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
|
||||
|
||||
- ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
|
||||
|
||||
- ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
|
||||
|
||||
- ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
|
||||
|
||||
- ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
|
||||
|
||||
- ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
|
||||
|
||||
- ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
|
||||
|
||||
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
|
||||
"""
|
||||
vocab_files_names = {}
|
||||
pretrained_vocab_files_map = {}
|
||||
@@ -49,82 +69,98 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
@property
|
||||
def bos_token(self):
|
||||
""" Beginning of sentence token (string). Log an error if used while not having been set. """
|
||||
if self._bos_token is None:
|
||||
logger.error("Using bos_token, but it is not set yet.")
|
||||
return self._bos_token
|
||||
|
||||
@property
|
||||
def eos_token(self):
|
||||
""" End of sentence token (string). Log an error if used while not having been set. """
|
||||
if self._eos_token is None:
|
||||
logger.error("Using eos_token, but it is not set yet.")
|
||||
return self._eos_token
|
||||
|
||||
@property
|
||||
def unk_token(self):
|
||||
""" Unknown token (string). Log an error if used while not having been set. """
|
||||
if self._unk_token is None:
|
||||
logger.error("Using unk_token, but it is not set yet.")
|
||||
return self._unk_token
|
||||
|
||||
@property
|
||||
def sep_token(self):
|
||||
""" Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
|
||||
if self._sep_token is None:
|
||||
logger.error("Using sep_token, but it is not set yet.")
|
||||
return self._sep_token
|
||||
|
||||
@property
|
||||
def pad_token(self):
|
||||
""" Padding token (string). Log an error if used while not having been set. """
|
||||
if self._pad_token is None:
|
||||
logger.error("Using pad_token, but it is not set yet.")
|
||||
return self._pad_token
|
||||
|
||||
@property
|
||||
def cls_token(self):
|
||||
""" Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
|
||||
if self._cls_token is None:
|
||||
logger.error("Using cls_token, but it is not set yet.")
|
||||
return self._cls_token
|
||||
|
||||
@property
|
||||
def mask_token(self):
|
||||
""" Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
|
||||
if self._mask_token is None:
|
||||
logger.error("Using mask_token, but it is not set yet.")
|
||||
return self._mask_token
|
||||
|
||||
@property
|
||||
def additional_special_tokens(self):
|
||||
""" All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
|
||||
if self._additional_special_tokens is None:
|
||||
logger.error("Using additional_special_tokens, but it is not set yet.")
|
||||
return self._additional_special_tokens
|
||||
|
||||
@bos_token.setter
|
||||
def bos_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._bos_token = value
|
||||
|
||||
@eos_token.setter
|
||||
def eos_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._eos_token = value
|
||||
|
||||
@unk_token.setter
|
||||
def unk_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._unk_token = value
|
||||
|
||||
@sep_token.setter
|
||||
def sep_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._sep_token = value
|
||||
|
||||
@pad_token.setter
|
||||
def pad_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._pad_token = value
|
||||
|
||||
@cls_token.setter
|
||||
def cls_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._cls_token = value
|
||||
|
||||
@mask_token.setter
|
||||
def mask_token(self, value):
|
||||
self.add_tokens([value])
|
||||
self._mask_token = value
|
||||
|
||||
@additional_special_tokens.setter
|
||||
def additional_special_tokens(self, value):
|
||||
self.add_tokens(value)
|
||||
self._additional_special_tokens = value
|
||||
|
||||
def __init__(self, max_len=None, **kwargs):
|
||||
@@ -148,15 +184,47 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *inputs, **kwargs):
|
||||
r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
|
||||
|
||||
Parameters:
|
||||
pretrained_model_name_or_path: either:
|
||||
|
||||
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
|
||||
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
|
||||
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
|
||||
|
||||
cache_dir: (`optional`) string:
|
||||
Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
|
||||
|
||||
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
|
||||
|
||||
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
|
||||
|
||||
Examples::
|
||||
|
||||
# We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
|
||||
|
||||
# Download vocabulary from S3 and cache.
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
|
||||
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
|
||||
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
|
||||
|
||||
# If the tokenizer uses a single vocabulary file, you can point directly to this file
|
||||
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
|
||||
|
||||
# You can link tokens to special vocabulary when instantiating
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
|
||||
# You should be sure '<unk>' is in the vocabulary when doing that.
|
||||
# Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
|
||||
assert tokenizer.unk_token == '<unk>'
|
||||
|
||||
"""
|
||||
return cls._from_pretrained(*inputs, **kwargs)
|
||||
|
||||
|
||||
@classmethod
|
||||
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
|
||||
Download and cache the vocabulary files if needed.
|
||||
"""
|
||||
cache_dir = kwargs.pop('cache_dir', None)
|
||||
|
||||
s3_models = list(cls.max_model_input_sizes.keys())
|
||||
@@ -253,8 +321,9 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save the tokenizer vocabulary files (with added tokens) and the
|
||||
special-tokens-to-class-attributes-mapping to a directory, so that it
|
||||
can be re-loaded using the `from_pretrained(save_directory)` class method.
|
||||
special-tokens-to-class-attributes-mapping to a directory.
|
||||
|
||||
This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||
"""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error("Saving directory ({}) should be a directory".format(save_directory))
|
||||
@@ -279,37 +348,50 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
|
||||
def save_vocabulary(self, save_directory):
|
||||
""" Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
|
||||
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
|
||||
and special token mappings.
|
||||
|
||||
Please use `save_pretrained()` to save the full Tokenizer state so that it can be
|
||||
reloaded using the `from_pretrained(save_directory)` class method.
|
||||
|
||||
Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def vocab_size(self):
|
||||
""" Size of the base vocabulary (without the added tokens) """
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def __len__(self):
|
||||
""" Size of the full vocabulary with the added tokens """
|
||||
return self.vocab_size + len(self.added_tokens_encoder)
|
||||
|
||||
|
||||
def add_tokens(self, new_tokens):
|
||||
""" Add a list of new tokens to the tokenizer class. If the new tokens are not in the
|
||||
vocabulary, they are added to the added_tokens_encoder with indices starting from
|
||||
the last index of the current vocabulary.
|
||||
vocabulary, they are added to it with indices starting from the length of the current vocabulary.
|
||||
|
||||
Parameters:
|
||||
new_tokens: list of strings. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
|
||||
|
||||
Returns:
|
||||
Number of tokens added to the vocabulary which can be used to correspondingly
|
||||
increase the size of the associated model embedding matrices.
|
||||
Number of tokens added to the vocabulary.
|
||||
|
||||
Examples::
|
||||
|
||||
# Let's see how to increase the vocabulary of Bert model and tokenizer
|
||||
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
||||
model = BertModel.from_pretrained('bert-base-uncased')
|
||||
|
||||
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
|
||||
print('We have added', num_added_toks, 'tokens')
|
||||
model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
|
||||
"""
|
||||
if not new_tokens:
|
||||
return 0
|
||||
|
||||
to_add_tokens = []
|
||||
for token in new_tokens:
|
||||
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
|
||||
if token != self.unk_token and \
|
||||
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
|
||||
to_add_tokens.append(token)
|
||||
@@ -325,23 +407,23 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
def add_special_tokens(self, special_tokens_dict):
|
||||
""" Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
|
||||
to class attributes. If the special tokens are not in the vocabulary, they are added
|
||||
to it and indexed starting from the last index of the current vocabulary.
|
||||
to class attributes. If special tokens are NOT in the vocabulary, they are added
|
||||
to it (indexed starting from the last index of the current vocabulary).
|
||||
|
||||
Parameters:
|
||||
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
|
||||
|
||||
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
|
||||
|
||||
Returns:
|
||||
Number of tokens added to the vocabulary which can be used to correspondingly
|
||||
increase the size of the associated model embedding matrices.
|
||||
"""
|
||||
if not special_tokens_dict:
|
||||
return 0
|
||||
|
||||
added_special_tokens = self.add_tokens(special_tokens_dict.values())
|
||||
for key, value in special_tokens_dict.items():
|
||||
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
|
||||
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
|
||||
setattr(self, key, value)
|
||||
|
||||
return added_special_tokens
|
||||
|
||||
|
||||
def tokenize(self, text, **kwargs):
|
||||
""" Converts a string in a sequence of tokens (string), using the tokenizer.
|
||||
@@ -369,13 +451,13 @@ class PreTrainedTokenizer(object):
|
||||
Split in words for word-based vocabulary or sub-words for sub-word-based
|
||||
vocabularies (BPE/SentencePieces/WordPieces).
|
||||
|
||||
Don't take care of added tokens.
|
||||
Do NOT take care of added tokens.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def convert_tokens_to_ids(self, tokens):
|
||||
""" Converts a single token or a sequence of tokens (str/unicode) in a integer id
|
||||
(resp.) a sequence of ids, using the vocabulary.
|
||||
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
|
||||
(resp. a sequence of ids), using the vocabulary.
|
||||
"""
|
||||
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
|
||||
return self._convert_token_to_id_with_added_voc(tokens)
|
||||
@@ -400,7 +482,8 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
def encode(self, text):
|
||||
""" Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
|
||||
same as self.convert_tokens_to_ids(self.tokenize(text)).
|
||||
|
||||
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
|
||||
"""
|
||||
return self.convert_tokens_to_ids(self.tokenize(text))
|
||||
|
||||
@@ -440,6 +523,8 @@ class PreTrainedTokenizer(object):
|
||||
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
|
||||
""" Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
|
||||
with options to remove special tokens and clean up tokenization spaces.
|
||||
|
||||
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
|
||||
"""
|
||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
text = self.convert_tokens_to_string(filtered_tokens)
|
||||
@@ -482,6 +567,8 @@ class PreTrainedTokenizer(object):
|
||||
|
||||
@staticmethod
|
||||
def clean_up_tokenization(out_string):
|
||||
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
|
||||
"""
|
||||
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
|
||||
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
|
||||
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
|
||||
|
||||
Reference in New Issue
Block a user