Documentation (#2989)
* All Tokenizers BertTokenizer + few fixes RobertaTokenizer OpenAIGPTTokenizer + Fixes GPT2Tokenizer + fixes TransfoXLTokenizer Correct rst for TransformerXL XLMTokenizer + fixes XLNet Tokenizer + Style DistilBERT + Fix XLNet RST CTRLTokenizer CamemBERT Tokenizer FlaubertTokenizer XLMRobertaTokenizer cleanup * cleanup
This commit is contained in:
@@ -41,7 +41,8 @@ AlbertTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.AlbertTokenizer
|
.. autoclass:: transformers.AlbertTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
AlbertModel
|
AlbertModel
|
||||||
|
|||||||
@@ -46,7 +46,8 @@ BertTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.BertTokenizer
|
.. autoclass:: transformers.BertTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
BertModel
|
BertModel
|
||||||
|
|||||||
@@ -33,7 +33,8 @@ CamembertTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.CamembertTokenizer
|
.. autoclass:: transformers.CamembertTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
CamembertModel
|
CamembertModel
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ CTRLTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.CTRLTokenizer
|
.. autoclass:: transformers.CTRLTokenizer
|
||||||
:members:
|
:members: save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
CTRLModel
|
CTRLModel
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ OpenAIGPTTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.OpenAIGPTTokenizer
|
.. autoclass:: transformers.OpenAIGPTTokenizer
|
||||||
:members:
|
:members: save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
OpenAIGPTModel
|
OpenAIGPTModel
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ Overview
|
|||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
OpenAI GPT-2 model was proposed in
|
OpenAI GPT-2 model was proposed in
|
||||||
`Language Models are Unsupervised Multitask Learners`_
|
`Language Models are Unsupervised Multitask Learners <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_
|
||||||
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
|
||||||
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
|
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
|
||||||
corpus of ~40 GB of text data.
|
corpus of ~40 GB of text data.
|
||||||
@@ -46,7 +46,7 @@ GPT2Tokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.GPT2Tokenizer
|
.. autoclass:: transformers.GPT2Tokenizer
|
||||||
:members:
|
:members: save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
GPT2Model
|
GPT2Model
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ RobertaTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.RobertaTokenizer
|
.. autoclass:: transformers.RobertaTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
RobertaModel
|
RobertaModel
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ TransfoXLTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.TransfoXLTokenizer
|
.. autoclass:: transformers.TransfoXLTokenizer
|
||||||
:members:
|
:members: save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
TransfoXLModel
|
TransfoXLModel
|
||||||
|
|||||||
@@ -41,7 +41,8 @@ XLMTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.XLMTokenizer
|
.. autoclass:: transformers.XLMTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
XLMModel
|
XLMModel
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ XLMRobertaTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.XLMRobertaTokenizer
|
.. autoclass:: transformers.XLMRobertaTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
XLMRobertaModel
|
XLMRobertaModel
|
||||||
|
|||||||
@@ -44,7 +44,8 @@ XLNetTokenizer
|
|||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.XLNetTokenizer
|
.. autoclass:: transformers.XLNetTokenizer
|
||||||
:members:
|
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
|
||||||
|
create_token_type_ids_from_sequences, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
XLNetModel
|
XLNetModel
|
||||||
|
|||||||
@@ -109,6 +109,7 @@ class FlaubertConfig(XLMConfig):
|
|||||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||||
:class:`~transformers.XLMForSequenceClassification`.
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
Is one of the following options:
|
Is one of the following options:
|
||||||
|
|
||||||
- 'last' => take the last token hidden state (like XLNet)
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
- 'first' => take the first token hidden state (like Bert)
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
- 'mean' => take the mean of all tokens hidden states
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ class GPT2Config(PretrainedConfig):
|
|||||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||||
:class:`~transformers.GPT2DoubleHeadsModel`.
|
:class:`~transformers.GPT2DoubleHeadsModel`.
|
||||||
Is one of the following options:
|
Is one of the following options:
|
||||||
|
|
||||||
- 'last' => take the last token hidden state (like XLNet)
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
- 'first' => take the first token hidden state (like Bert)
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
- 'mean' => take the mean of all tokens hidden states
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||||
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
|
||||||
Is one of the following options:
|
Is one of the following options:
|
||||||
|
|
||||||
- 'last' => take the last token hidden state (like XLNet)
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
- 'first' => take the first token hidden state (like Bert)
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
- 'mean' => take the mean of all tokens hidden states
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
|||||||
@@ -108,6 +108,7 @@ class XLMConfig(PretrainedConfig):
|
|||||||
Argument used when doing sequence summary. Used in for the multiple choice head in
|
Argument used when doing sequence summary. Used in for the multiple choice head in
|
||||||
:class:`~transformers.XLMForSequenceClassification`.
|
:class:`~transformers.XLMForSequenceClassification`.
|
||||||
Is one of the following options:
|
Is one of the following options:
|
||||||
|
|
||||||
- 'last' => take the last token hidden state (like XLNet)
|
- 'last' => take the last token hidden state (like XLNet)
|
||||||
- 'first' => take the first token hidden state (like Bert)
|
- 'first' => take the first token hidden state (like Bert)
|
||||||
- 'mean' => take the mean of all tokens hidden states
|
- 'mean' => take the mean of all tokens hidden states
|
||||||
|
|||||||
@@ -1230,7 +1230,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||||
loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided):
|
loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||||
Classification loss.
|
Classification loss.
|
||||||
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
|
classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
|
||||||
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
|
`num_choices` is the second dimension of the input tensors. (see `input_ids` above).
|
||||||
|
|||||||
@@ -686,10 +686,11 @@ class TFBertModel(TFBertPreTrainedModel):
|
|||||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
|
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
|
||||||
tuple of :obj:`tf.Tensor` (one for each layer) of shape
|
tuple of :obj:`tf.Tensor` (one for each layer) of shape
|
||||||
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
|
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
|
||||||
|
|
||||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
|
||||||
|
|
||||||
Examples::
|
Examples::
|
||||||
|
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
@@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁"
|
|||||||
|
|
||||||
class AlbertTokenizer(PreTrainedTokenizer):
|
class AlbertTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
SentencePiece based tokenizer. Peculiarities:
|
Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
|
||||||
|
|
||||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`string`):
|
||||||
|
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
|
||||||
|
contains the vocabulary necessary to instantiate a tokenizer.
|
||||||
|
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to lowercase the input when tokenizing.
|
||||||
|
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
|
||||||
|
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to keep accents when tokenizing.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end
|
||||||
|
of sequence. The token used is the :obj:`sep_token`.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
return self.sp_model.IdToPiece(index)
|
return self.sp_model.IdToPiece(index)
|
||||||
|
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
|
||||||
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
An ALBERT sequence has the following format:
|
An ALBERT sequence has the following format:
|
||||||
single sequence: [CLS] X [SEP]
|
|
||||||
pair of sequences: [CLS] A [SEP] B [SEP]
|
- single sequence: ``[CLS] X [SEP]``
|
||||||
|
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
return cls + token_ids_0 + sep
|
return cls + token_ids_0 + sep
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
if token_ids_1 is not None:
|
if token_ids_1 is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"You should not supply a second sequence if the provided sequence of "
|
"You should not supply a second sequence if the provided sequence of "
|
||||||
"ids is already formated with special tokens for the model."
|
"ids is already formatted with special tokens for the model."
|
||||||
)
|
)
|
||||||
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
||||||
|
|
||||||
@@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An ALBERT sequence pair mask has the following format:
|
An ALBERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | second sequence
|
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
::
|
||||||
|
|
||||||
|
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence |
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
||||||
|
sequence(s).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
"""
|
||||||
to a directory.
|
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
"""
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import collections
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
from tokenizers import BertWordPieceTokenizer
|
from tokenizers import BertWordPieceTokenizer
|
||||||
|
|
||||||
@@ -117,17 +118,41 @@ def whitespace_tokenize(text):
|
|||||||
|
|
||||||
class BertTokenizer(PreTrainedTokenizer):
|
class BertTokenizer(PreTrainedTokenizer):
|
||||||
r"""
|
r"""
|
||||||
Constructs a BertTokenizer.
|
Constructs a BERT tokenizer. Based on WordPiece.
|
||||||
:class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
vocab_file (:obj:`string`):
|
||||||
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
|
File containing the vocabulary.
|
||||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
|
Whether to lowercase the input when tokenizing.
|
||||||
minimum of this value (if specified) and the underlying BERT model's sequence length.
|
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
never_split: List of tokens which will never be split during tokenization. Only has an effect when
|
Whether to do basic tokenization before WordPiece.
|
||||||
do_basic_tokenize=True
|
never_split (:obj:`List[str]`, `optional`):
|
||||||
|
List of tokens which will never be split during tokenization. Only has an effect when
|
||||||
|
:obj:`do_basic_tokenize=True`
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to tokenize Chinese characters.
|
||||||
|
This should likely be deactivated for Japanese:
|
||||||
|
see: https://github.com/huggingface/transformers/issues/328
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
tokenize_chinese_chars=True,
|
tokenize_chinese_chars=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
):
|
):
|
||||||
"""Constructs a BertTokenizer.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file
|
|
||||||
**do_lower_case**: (`optional`) boolean (default True)
|
|
||||||
Whether to lower case the input
|
|
||||||
Only has an effect when do_basic_tokenize=True
|
|
||||||
**do_basic_tokenize**: (`optional`) boolean (default True)
|
|
||||||
Whether to do basic tokenization before wordpiece.
|
|
||||||
**never_split**: (`optional`) list of string
|
|
||||||
List of tokens which will never be split during tokenization.
|
|
||||||
Only has an effect when do_basic_tokenize=True
|
|
||||||
**tokenize_chinese_chars**: (`optional`) boolean (default True)
|
|
||||||
Whether to tokenize Chinese characters.
|
|
||||||
This should likely be deactivated for Japanese:
|
|
||||||
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
|
|
||||||
"""
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
sep_token=sep_token,
|
sep_token=sep_token,
|
||||||
@@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = " ".join(tokens).replace(" ##", "").strip()
|
out_string = " ".join(tokens).replace(" ##", "").strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A BERT sequence has the following format:
|
A BERT sequence has the following format:
|
||||||
single sequence: [CLS] X [SEP]
|
|
||||||
pair of sequences: [CLS] A [SEP] B [SEP]
|
- single sequence: ``[CLS] X [SEP]``
|
||||||
|
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
@@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
@@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A BERT sequence pair mask has the following format:
|
A BERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | second sequence
|
::
|
||||||
|
|
||||||
|
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence |
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
||||||
|
sequence(s).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer):
|
|||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
"""Save the tokenizer vocabulary to a directory or file."""
|
"""
|
||||||
|
Save the tokenizer vocabulary to a directory or file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_path (:obj:`str`):
|
||||||
|
The directory (or file path) in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
index = 0
|
index = 0
|
||||||
if os.path.isdir(vocab_path):
|
if os.path.isdir(vocab_path):
|
||||||
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
import sentencepiece as spm
|
import sentencepiece as spm
|
||||||
|
|
||||||
@@ -54,6 +55,49 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
SentencePiece based tokenizer. Peculiarities:
|
SentencePiece based tokenizer. Peculiarities:
|
||||||
|
|
||||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end
|
||||||
|
of sequence. The token used is the :obj:`sep_token`.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
|
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
|
||||||
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
|
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A RoBERTa sequence has the following format:
|
A CamemBERT sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
|
||||||
pair of sequences: <s> A </s></s> B </s>
|
- single sequence: ``<s> X </s>``
|
||||||
|
- pair of sequences: ``<s> A </s></s> B </s>``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
if token_ids_1 is not None:
|
if token_ids_1 is not None:
|
||||||
@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
A RoBERTa sequence pair mask has the following format:
|
A CamemBERT sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | second sequence
|
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
::
|
||||||
|
|
||||||
|
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | | second sequence |
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
||||||
|
sequence(s).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
"""
|
||||||
to a directory.
|
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
"""
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
|||||||
@@ -116,8 +116,21 @@ def get_pairs(word):
|
|||||||
|
|
||||||
class CTRLTokenizer(PreTrainedTokenizer):
|
class CTRLTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
CTRL BPE tokenizer. Peculiarities:
|
Constructs a CTRL tokenizer. Peculiarities:
|
||||||
|
|
||||||
- Byte-Pair-Encoding
|
- Byte-Pair-Encoding
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
merges_file (:obj:`str`):
|
||||||
|
Path to the merges file.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
|||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""
|
||||||
|
Save the tokenizer vocabulary and merges files to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
|
|||||||
class DistilBertTokenizer(BertTokenizer):
|
class DistilBertTokenizer(BertTokenizer):
|
||||||
r"""
|
r"""
|
||||||
Constructs a DistilBertTokenizer.
|
Constructs a DistilBertTokenizer.
|
||||||
:class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
|
:class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
|
||||||
|
tokenization: punctuation splitting + wordpiece.
|
||||||
|
|
||||||
Args:
|
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
|
||||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file
|
parameters.
|
||||||
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
|
|
||||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
|
|
||||||
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
|
|
||||||
minimum of this value (if specified) and the underlying BERT model's sequence length.
|
|
||||||
never_split: List of tokens which will never be split during tokenization. Only has an effect when
|
|
||||||
do_basic_tokenize=True
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -81,13 +81,13 @@ class FlaubertTokenizer(XLMTokenizer):
|
|||||||
BPE tokenizer for Flaubert
|
BPE tokenizer for Flaubert
|
||||||
|
|
||||||
- Moses preprocessing & tokenization
|
- Moses preprocessing & tokenization
|
||||||
|
|
||||||
- Normalize all inputs text
|
- Normalize all inputs text
|
||||||
|
|
||||||
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
||||||
(ex: "__classify__") to a vocabulary
|
(ex: "__classify__") to a vocabulary
|
||||||
|
|
||||||
- `do_lowercase` controle lower casing (automatically set for pretrained vocabularies)
|
- `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
|
||||||
|
and documentation regarding arguments.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -101,11 +101,35 @@ def get_pairs(word):
|
|||||||
class GPT2Tokenizer(PreTrainedTokenizer):
|
class GPT2Tokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
GPT-2 BPE tokenizer. Peculiarities:
|
GPT-2 BPE tokenizer. Peculiarities:
|
||||||
|
|
||||||
- Byte-level Byte-Pair-Encoding
|
- Byte-level Byte-Pair-Encoding
|
||||||
- Requires a space to start the input string => the encoding and tokenize methods should be called with the
|
- Requires a space to start the input string => the encoding methods should be called with the
|
||||||
``add_prefix_space`` flag set to ``True``.
|
``add_prefix_space`` flag set to ``True``.
|
||||||
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
|
Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
|
||||||
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
|
the absence of a space at the beginning of a string:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
merges_file (:obj:`str`):
|
||||||
|
Path to the merges file.
|
||||||
|
errors (:obj:`str`, `optional`, defaults to "replace"):
|
||||||
|
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||||
|
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||||
|
The beginning of sequence token.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||||
|
The end of sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""
|
||||||
|
Save the tokenizer vocabulary and merges files to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -82,8 +82,21 @@ def text_standardize(text):
|
|||||||
class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
BPE tokenizer. Peculiarities:
|
BPE tokenizer. Peculiarities:
|
||||||
|
|
||||||
- lower case all inputs
|
- lower case all inputs
|
||||||
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
|
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
merges_file (:obj:`str`):
|
||||||
|
Path to the merges file.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
|||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""
|
||||||
|
Save the tokenizer vocabulary and merges files to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
from tokenizers.processors import RobertaProcessing
|
from tokenizers.processors import RobertaProcessing
|
||||||
|
|
||||||
@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
|
|
||||||
class RobertaTokenizer(GPT2Tokenizer):
|
class RobertaTokenizer(GPT2Tokenizer):
|
||||||
"""
|
"""
|
||||||
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
|
||||||
|
|
||||||
- Byte-level Byte-Pair-Encoding
|
- Byte-level Byte-Pair-Encoding
|
||||||
- Requires a space to start the input string => the encoding methods should be called with the
|
- Requires a space to start the input string => the encoding methods should be called with the
|
||||||
``add_prefix_space`` flag set to ``True``.
|
``add_prefix_space`` flag set to ``True``.
|
||||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
|
||||||
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
|
the absence of a space at the beginning of a string:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
merges_file (:obj:`str`):
|
||||||
|
Path to the merges file.
|
||||||
|
errors (:obj:`str`, `optional`, defaults to "replace"):
|
||||||
|
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||||
|
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end
|
||||||
|
of sequence. The token used is the :obj:`sep_token`.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A RoBERTa sequence has the following format:
|
A RoBERTa sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
|
||||||
pair of sequences: <s> A </s></s> B </s>
|
- single sequence: ``<s> X </s>``
|
||||||
|
- pair of sequences: ``<s> A </s></s> B </s>``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
if token_ids_1 is not None:
|
if token_ids_1 is not None:
|
||||||
@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
|
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of zeros.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|||||||
@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
|
|||||||
class TransfoXLTokenizer(PreTrainedTokenizer):
|
class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
|
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
|||||||
raise ValueError("No <unkown> token in vocabulary")
|
raise ValueError("No <unkown> token in vocabulary")
|
||||||
|
|
||||||
def save_vocabulary(self, vocab_path):
|
def save_vocabulary(self, vocab_path):
|
||||||
"""Save the tokenizer vocabulary to a directory or file."""
|
"""
|
||||||
|
Save the vocabulary and special tokens file to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_path (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Please note you will not be able to load the save vocabulary in"
|
"Please note you will not be able to load the save vocabulary in"
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
import sacremoses as sm
|
import sacremoses as sm
|
||||||
|
|
||||||
@@ -531,19 +532,58 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
BPE tokenizer for XLM
|
BPE tokenizer for XLM
|
||||||
|
|
||||||
- Moses preprocessing & tokenization for most supported languages
|
- Moses preprocessing & tokenization for most supported languages
|
||||||
|
|
||||||
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
|
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
|
||||||
|
|
||||||
- (optionally) lower case & normalize all inputs text
|
- (optionally) lower case & normalize all inputs text
|
||||||
|
|
||||||
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
|
||||||
(ex: "__classify__") to a vocabulary
|
(ex: "__classify__") to a vocabulary
|
||||||
|
|
||||||
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
|
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
|
||||||
|
|
||||||
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
|
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
|
||||||
|
|
||||||
- `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies)
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`string`):
|
||||||
|
Vocabulary file.
|
||||||
|
merges_file (:obj:`string`):
|
||||||
|
Merges file.
|
||||||
|
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to lowercase the input when tokenizing.
|
||||||
|
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
|
||||||
|
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to keep accents when tokenizing.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
|
||||||
|
List of additional special tokens.
|
||||||
|
lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Dictionary mapping languages string identifiers to their IDs.
|
||||||
|
id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Dictionary mapping language IDs to their string identifiers.
|
||||||
|
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to lowercase and remove accents when tokenizing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = "".join(tokens).replace("</w>", " ").strip()
|
out_string = "".join(tokens).replace("</w>", " ").strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A XLM sequence has the following format:
|
A XLM sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
|
||||||
pair of sequences: <s> A </s> B </s>
|
- single sequence: ``<s> X </s>``
|
||||||
|
- pair of sequences: ``<s> A </s> B </s>``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
return cls + token_ids_0 + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLM sequence pair mask has the following format:
|
An XLM sequence pair mask has the following format:
|
||||||
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
|
||||||
| first sequence | second sequence
|
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
::
|
||||||
|
|
||||||
|
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||||
|
| first sequence | second sequence |
|
||||||
|
|
||||||
|
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`__ according to the given
|
||||||
|
sequence(s).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
|
|||||||
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
"""Save the tokenizer vocabulary and merge files to a directory."""
|
"""
|
||||||
|
Save the tokenizer vocabulary and merges files to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
from transformers.tokenization_utils import PreTrainedTokenizer
|
from transformers.tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
@@ -55,6 +56,49 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
SentencePiece based tokenizer. Peculiarities:
|
SentencePiece based tokenizer. Peculiarities:
|
||||||
|
|
||||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
||||||
|
|
||||||
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`str`):
|
||||||
|
Path to the vocabulary file.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end
|
||||||
|
of sequence. The token used is the :obj:`sep_token`.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
self.sp_model = spm.SentencePieceProcessor()
|
self.sp_model = spm.SentencePieceProcessor()
|
||||||
self.sp_model.Load(self.vocab_file)
|
self.sp_model.Load(self.vocab_file)
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
A RoBERTa sequence has the following format:
|
A XLM-R sequence has the following format:
|
||||||
single sequence: <s> X </s>
|
|
||||||
pair of sequences: <s> A </s></s> B </s>
|
- single sequence: ``<s> X </s>``
|
||||||
|
- pair of sequences: ``<s> A </s></s> B </s>``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
if token_ids_1 is not None:
|
if token_ids_1 is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||||
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
|
XLM-R does not make use of token type ids, therefore a list of zeros is returned.
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of zeros.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
|
|
||||||
@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
"""
|
||||||
to a directory.
|
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
"""
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import unicodedata
|
import unicodedata
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
|
|
||||||
@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
|
|||||||
|
|
||||||
class XLNetTokenizer(PreTrainedTokenizer):
|
class XLNetTokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
SentencePiece based tokenizer. Peculiarities:
|
Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
|
||||||
|
|
||||||
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
|
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||||
|
should refer to the superclass for more information regarding methods.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
vocab_file (:obj:`string`):
|
||||||
|
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
|
||||||
|
contains the vocabulary necessary to instantiate a tokenizer.
|
||||||
|
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to lowercase the input when tokenizing.
|
||||||
|
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
|
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
|
||||||
|
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Whether to keep accents when tokenizing.
|
||||||
|
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
|
||||||
|
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning
|
||||||
|
of sequence. The token used is the :obj:`cls_token`.
|
||||||
|
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
|
||||||
|
The end of sequence token.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the end
|
||||||
|
of sequence. The token used is the :obj:`sep_token`.
|
||||||
|
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||||
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
|
token instead.
|
||||||
|
sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
|
||||||
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||||
|
for sequence classification or for a text and a question for question answering.
|
||||||
|
It is also used as the last token of a sequence built with special tokens.
|
||||||
|
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
|
||||||
|
The token used for padding, for example when batching sequences of different lengths.
|
||||||
|
cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
|
||||||
|
The classifier token which is used when doing sequence classification (classification of the whole
|
||||||
|
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||||
|
special tokens.
|
||||||
|
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
|
||||||
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
|
modeling. This is the token which the model will try to predict.
|
||||||
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
|
||||||
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
||||||
by concatenating and adding special tokens.
|
by concatenating and adding special tokens.
|
||||||
An XLNet sequence has the following format:
|
An XLNet sequence has the following format:
|
||||||
single sequence: X <sep> <cls>
|
|
||||||
pair of sequences: A <sep> B <sep> <cls>
|
- single sequence: ``X <sep> <cls>``
|
||||||
|
- pair of sequences: ``A <sep> B <sep> <cls>``
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of IDs to which the special tokens will be added
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls = [self.cls_token_id]
|
cls = [self.cls_token_id]
|
||||||
@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
return token_ids_0 + sep + cls
|
return token_ids_0 + sep + cls
|
||||||
return token_ids_0 + sep + token_ids_1 + sep + cls
|
return token_ids_0 + sep + token_ids_1 + sep + cls
|
||||||
|
|
||||||
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
|
def get_special_tokens_mask(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||||
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
token_ids_0: list of ids (must not contain special tokens)
|
token_ids_0 (:obj:`List[int]`):
|
||||||
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
|
List of ids.
|
||||||
for sequence pairs
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
already_has_special_tokens: (default False) Set to True if the token list is already formated with
|
Optional second list of IDs for sequence pairs.
|
||||||
special tokens for the model
|
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
|
Set to True if the token list is already formatted with special tokens for the model
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if already_has_special_tokens:
|
if already_has_special_tokens:
|
||||||
@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
|
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
|
||||||
return ([0] * len(token_ids_0)) + [1, 1]
|
return ([0] * len(token_ids_0)) + [1, 1]
|
||||||
|
|
||||||
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
|
def create_token_type_ids_from_sequences(
|
||||||
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
|
) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
||||||
An XLNet sequence pair mask has the following format:
|
An XLNet sequence pair mask has the following format:
|
||||||
@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
| first sequence | second sequence | CLS segment ID
|
| first sequence | second sequence | CLS segment ID
|
||||||
|
|
||||||
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token_ids_0 (:obj:`List[int]`):
|
||||||
|
List of ids.
|
||||||
|
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
||||||
|
Optional second list of IDs for sequence pairs.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
||||||
|
sequence(s).
|
||||||
"""
|
"""
|
||||||
sep = [self.sep_token_id]
|
sep = [self.sep_token_id]
|
||||||
cls_segment_id = [2]
|
cls_segment_id = [2]
|
||||||
@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
|||||||
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
|
||||||
|
|
||||||
def save_vocabulary(self, save_directory):
|
def save_vocabulary(self, save_directory):
|
||||||
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
|
"""
|
||||||
to a directory.
|
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
save_directory (:obj:`str`):
|
||||||
|
The directory in which to save the vocabulary.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
:obj:`Tuple(str)`: Paths to the files saved.
|
||||||
"""
|
"""
|
||||||
if not os.path.isdir(save_directory):
|
if not os.path.isdir(save_directory):
|
||||||
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
|
||||||
|
|||||||
Reference in New Issue
Block a user