Cleanup fast tokenizers integration (#3706)
* First pass on utility classes and python tokenizers * finishing cleanup pass * style and quality * Fix tests * Updating following @mfuntowicz comment * style and quality * Fix Roberta * fix batch_size/seq_length inBatchEncoding * add alignement methods + tests * Fix OpenAI and Transfo-XL tokenizers * adding trim_offsets=True default for GPT2 et RoBERTa * style and quality * fix tests * add_prefix_space in roberta * bump up tokenizers to rc7 * style * unfortunately tensorfow does like these - removing shape/seq_len for now * Update src/transformers/tokenization_utils.py Co-Authored-By: Stefan Schweter <stefan@schweter.it> * Adding doc and docstrings * making flake8 happy Co-authored-by: Stefan Schweter <stefan@schweter.it>
This commit is contained in:
@@ -137,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
except ImportError:
|
||||
|
||||
@@ -182,8 +182,6 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
mask_token=mask_token,
|
||||
**kwargs,
|
||||
)
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
@@ -583,6 +581,48 @@ def _is_punctuation(char):
|
||||
|
||||
|
||||
class BertTokenizerFast(PreTrainedTokenizerFast):
|
||||
r"""
|
||||
Constructs a "Fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
Bert tokenization is Based on WordPiece.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`string`):
|
||||
File containing the vocabulary.
|
||||
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to lowercase the input when tokenizing.
|
||||
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
||||
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
||||
for sequence classification or for a text and a question for question answering.
|
||||
It is also used as the last token of a sequence built with special tokens.
|
||||
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
||||
The classifier token which is used when doing sequence classification (classification of the whole
|
||||
sequence instead of per-token classification). It is the first token of the sequence when built with
|
||||
special tokens.
|
||||
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
||||
The token used for masking values. This is the token used when training this model with masked language
|
||||
modeling. This is the token which the model will try to predict.
|
||||
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to tokenize Chinese characters.
|
||||
This should likely be deactivated for Japanese:
|
||||
see: https://github.com/huggingface/transformers/issues/328
|
||||
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to clean the text before tokenization by removing any control characters and
|
||||
replacing all whitespaces by the classic one.
|
||||
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to tokenize Chinese characters.
|
||||
This should likely be deactivated for Japanese:
|
||||
see: https://github.com/huggingface/transformers/issues/328
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
|
||||
|
||||
@@ -119,8 +119,6 @@ class BertJapaneseTokenizer(BertTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
# ^^ We call the grandparent's init, not the parent's.
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
if not os.path.isfile(vocab_file):
|
||||
raise ValueError(
|
||||
|
||||
@@ -129,8 +129,6 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs,
|
||||
)
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(str(vocab_file))
|
||||
self.vocab_file = vocab_file
|
||||
|
||||
@@ -140,12 +140,6 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
||||
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
super().__init__(unk_token=unk_token, **kwargs)
|
||||
self.max_len_single_sentence = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
|
||||
@@ -57,8 +57,9 @@ PRETRAINED_INIT_CONFIGURATION = {
|
||||
|
||||
class DistilBertTokenizer(BertTokenizer):
|
||||
r"""
|
||||
Constructs a DistilBertTokenizer.
|
||||
:class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
|
||||
Constructs a DistilBertTokenizer.
|
||||
|
||||
:class:`~transformers.DistilBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
|
||||
tokenization: punctuation splitting + wordpiece.
|
||||
|
||||
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
|
||||
@@ -73,6 +74,16 @@ class DistilBertTokenizer(BertTokenizer):
|
||||
|
||||
|
||||
class DistilBertTokenizerFast(BertTokenizerFast):
|
||||
r"""
|
||||
Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
:class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
|
||||
tokenization: punctuation splitting + wordpiece.
|
||||
|
||||
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
|
||||
parameters.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
|
||||
@@ -67,7 +67,8 @@ class ElectraTokenizer(BertTokenizer):
|
||||
|
||||
class ElectraTokenizerFast(BertTokenizerFast):
|
||||
r"""
|
||||
Constructs an Electra Fast tokenizer.
|
||||
Constructs a "Fast" Electra Fast tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
:class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
|
||||
tokenization: punctuation splitting + wordpiece.
|
||||
|
||||
|
||||
@@ -147,12 +147,6 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
|
||||
self.max_len_single_sentence = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
@@ -284,6 +278,47 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
|
||||
|
||||
class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Constructs a "Fast" GPT-2 BPE tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
Peculiarities:
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
Path to the vocabulary file.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
errors (:obj:`str`, `optional`, defaults to "replace"):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The beginning of sequence token.
|
||||
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The end of sequence token.
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
|
||||
Whether to add a leading space to the first word.
|
||||
This allows to treat the leading word just as any other word.
|
||||
(GPT2 tokenizer detect beginning of words by the preceeding space)
|
||||
trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
|
||||
Whether the post processing step should trim offsets to avoid including whitespaces.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
@@ -296,10 +331,16 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
||||
bos_token="<|endoftext|>",
|
||||
eos_token="<|endoftext|>",
|
||||
add_prefix_space=False,
|
||||
trim_offsets=True,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(
|
||||
ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, add_prefix_space=add_prefix_space),
|
||||
ByteLevelBPETokenizer(
|
||||
vocab_file=vocab_file,
|
||||
merges_file=merges_file,
|
||||
add_prefix_space=add_prefix_space,
|
||||
trim_offsets=trim_offsets,
|
||||
),
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
|
||||
@@ -19,15 +19,8 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import List, Optional, Union
|
||||
|
||||
from tokenizers import Tokenizer
|
||||
from tokenizers.decoders import BPEDecoder
|
||||
from tokenizers.implementations import BaseTokenizer
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import BertNormalizer, Sequence, unicode_normalizer_from_str
|
||||
from tokenizers.pre_tokenizers import BertPreTokenizer
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
from tokenizers import CharBPETokenizer
|
||||
|
||||
from .tokenization_bert import BasicTokenizer
|
||||
from .tokenization_utils import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
@@ -106,13 +99,6 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
super().__init__(unk_token=unk_token, **kwargs)
|
||||
|
||||
self.max_len_single_sentence = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
from spacy.lang.en import English
|
||||
@@ -249,83 +235,28 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
|
||||
return vocab_file, merge_file
|
||||
|
||||
|
||||
class _OpenAIGPTCharBPETokenizer(BaseTokenizer):
|
||||
"""
|
||||
OpenAI character-level BPE Tokenizer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file: Optional[str] = None,
|
||||
merges_file: Optional[str] = None,
|
||||
unk_token: Optional[str] = "<unk>",
|
||||
suffix: Optional[str] = "</w>",
|
||||
dropout: Optional[float] = None,
|
||||
unicode_normalizer: Optional[str] = None,
|
||||
):
|
||||
if vocab_file is not None and merges_file is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
# Check for Unicode normalization first (before everything else)
|
||||
normalizers = []
|
||||
|
||||
if unicode_normalizer:
|
||||
normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
|
||||
|
||||
# OpenAI normalization is the same as Bert
|
||||
normalizers += [BertNormalizer()]
|
||||
|
||||
# Create the normalizer structure
|
||||
if len(normalizers) > 0:
|
||||
if len(normalizers) > 1:
|
||||
tokenizer.normalizer = Sequence(normalizers)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
tokenizer.pre_tokenizer = BertPreTokenizer()
|
||||
tokenizer.decoder = BPEDecoder(suffix=suffix)
|
||||
|
||||
parameters = {
|
||||
"model": "BPE",
|
||||
"unk_token": unk_token,
|
||||
"suffix": suffix,
|
||||
"dropout": dropout,
|
||||
}
|
||||
|
||||
super().__init__(tokenizer, parameters)
|
||||
|
||||
def train(
|
||||
self,
|
||||
files: Union[str, List[str]],
|
||||
vocab_size: int = 30000,
|
||||
min_frequency: int = 2,
|
||||
special_tokens: List[str] = ["<unk>"],
|
||||
limit_alphabet: int = 1000,
|
||||
initial_alphabet: List[str] = [],
|
||||
suffix: Optional[str] = "</w>",
|
||||
show_progress: bool = True,
|
||||
):
|
||||
""" Train the model using the given files """
|
||||
|
||||
trainer = BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
min_frequency=min_frequency,
|
||||
special_tokens=special_tokens,
|
||||
limit_alphabet=limit_alphabet,
|
||||
initial_alphabet=initial_alphabet,
|
||||
end_of_word_suffix=suffix,
|
||||
show_progress=show_progress,
|
||||
)
|
||||
if isinstance(files, str):
|
||||
files = [files]
|
||||
self._tokenizer.train(trainer, files)
|
||||
|
||||
|
||||
class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "Fast" BPE tokenizer for OpenAI GPT (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
Peculiarities:
|
||||
|
||||
- lower case all inputs
|
||||
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
Path to the vocabulary file.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
@@ -333,5 +264,6 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
|
||||
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
|
||||
kwargs.setdefault("unk_token", unk_token)
|
||||
super().__init__(
|
||||
_OpenAIGPTCharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token), **kwargs
|
||||
CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -150,8 +150,6 @@ class RobertaTokenizer(GPT2Tokenizer):
|
||||
mask_token=mask_token,
|
||||
**kwargs,
|
||||
)
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||
|
||||
def build_inputs_with_special_tokens(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
@@ -244,6 +242,47 @@ class RobertaTokenizer(GPT2Tokenizer):
|
||||
|
||||
|
||||
class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
"""
|
||||
Constructs a "Fast" RoBERTa BPE tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
Peculiarities:
|
||||
|
||||
- Byte-level Byte-Pair-Encoding
|
||||
- Requires a space to start the input string => the encoding methods should be called with the
|
||||
``add_prefix_space`` flag set to ``True``.
|
||||
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
|
||||
the absence of a space at the beginning of a string:
|
||||
|
||||
::
|
||||
|
||||
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
|
||||
Args:
|
||||
vocab_file (:obj:`str`):
|
||||
Path to the vocabulary file.
|
||||
merges_file (:obj:`str`):
|
||||
Path to the merges file.
|
||||
errors (:obj:`str`, `optional`, defaults to "replace"):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
|
||||
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
|
||||
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The beginning of sequence token.
|
||||
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
|
||||
The end of sequence token.
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to `False`):
|
||||
Whether to add a leading space to the first word.
|
||||
This allows to treat the leading word just as any other word.
|
||||
(GPT2 tokenizer detect beginning of words by the preceeding space)
|
||||
trim_offsets (:obj:`bool`, `optional`, defaults to `True`):
|
||||
Whether the post processing step should trim offsets to avoid including whitespaces.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
@@ -262,6 +301,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
pad_token="<pad>",
|
||||
mask_token="<mask>",
|
||||
add_prefix_space=True,
|
||||
trim_offsets=True,
|
||||
**kwargs
|
||||
):
|
||||
kwargs.setdefault("pad_token", pad_token)
|
||||
@@ -276,23 +316,18 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
add_prefix_space=add_prefix_space,
|
||||
trim_offsets=trim_offsets,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.tokenizer._tokenizer.post_processor = RobertaProcessing(
|
||||
(sep_token, self.sep_token_id), (cls_token, self.cls_token_id)
|
||||
self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing(
|
||||
sep=(sep_token, self.sep_token_id),
|
||||
cls=(cls_token, self.cls_token_id),
|
||||
add_prefix_space=add_prefix_space,
|
||||
trim_offsets=trim_offsets,
|
||||
)
|
||||
|
||||
self.tokenizer.add_special_tokens([kwargs["mask_token"]])
|
||||
|
||||
# As we override the post_processor post super.__init__ the computed num_added_tokens is wrong in super().
|
||||
# We need to recompute max_len according to the newly register post_processor to get real values.
|
||||
self.max_len_single_sentence = self.max_len - self.num_special_tokens_to_add(
|
||||
False
|
||||
) # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - self.num_special_tokens_to_add(
|
||||
True
|
||||
) # take into account special tokens
|
||||
self.backend_tokenizer.add_special_tokens([kwargs["mask_token"]])
|
||||
|
||||
@PreTrainedTokenizer.mask_token.setter
|
||||
def mask_token(self, value):
|
||||
@@ -300,7 +335,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
|
||||
value = AddedToken(value, lstrip=True)
|
||||
|
||||
self._mask_token = str(value)
|
||||
self.tokenizer.add_special_tokens([value])
|
||||
self._maybe_update_backend([value])
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
||||
|
||||
@@ -118,12 +118,6 @@ class T5Tokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
**kwargs,
|
||||
)
|
||||
self.max_len_single_sentence = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
|
||||
@@ -101,13 +101,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
|
||||
unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
|
||||
)
|
||||
|
||||
self.max_len_single_sentence = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = (
|
||||
self.max_len
|
||||
) # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
if never_split is None:
|
||||
never_split = self.all_special_tokens
|
||||
if special is None:
|
||||
@@ -410,6 +403,16 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
|
||||
|
||||
|
||||
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
|
||||
"""
|
||||
Construct a "Fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
The Transformer-XL tokenizer is a word-level tokenizer (no sub-word tokenization).
|
||||
|
||||
Adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
|
||||
|
||||
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
|
||||
should refer to the superclass for more information regarding methods.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES_FAST
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -629,9 +629,6 @@ class XLMTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
|
||||
# cache of sm.MosesPunctNormalizer instance
|
||||
self.cache_moses_punct_normalizer = dict()
|
||||
# cache of sm.MosesTokenizer instance
|
||||
|
||||
@@ -128,8 +128,6 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
||||
mask_token=mask_token,
|
||||
**kwargs,
|
||||
)
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
|
||||
|
||||
try:
|
||||
import sentencepiece as spm
|
||||
|
||||
@@ -138,8 +138,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
|
||||
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
|
||||
self._pad_token_type_id = 3
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user