Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4 switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dumping the fast version of transformer XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
Thomas Wolf
2020-10-08 11:32:16 +02:00
committed by GitHub
parent 4d04120c6d
commit 9aeacb58ba
60 changed files with 4663 additions and 1207 deletions

View File

@@ -46,13 +46,6 @@ TransfoXLTokenizer
:members: save_vocabulary :members: save_vocabulary
TransfoXLTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TransfoXLTokenizerFast
:members:
TransfoXL specific outputs TransfoXL specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -111,7 +111,7 @@ setup(
packages=find_packages("src"), packages=find_packages("src"),
install_requires=[ install_requires=[
"numpy", "numpy",
"tokenizers == 0.8.1.rc2", "tokenizers == 0.9.0.rc2",
# dataclasses for Python versions that don't have it # dataclasses for Python versions that don't have it
"dataclasses;python_version<'3.7'", "dataclasses;python_version<'3.7'",
# utilities from PyPA to e.g. compare versions # utilities from PyPA to e.g. compare versions
@@ -124,8 +124,9 @@ setup(
"tqdm >= 4.27", "tqdm >= 4.27",
# for OpenAI GPT # for OpenAI GPT
"regex != 2019.12.17", "regex != 2019.12.17",
# for XLNet # for SentencePiece models
"sentencepiece != 0.1.92", "sentencepiece != 0.1.92",
"protobuf",
# for XLM # for XLM
"sacremoses", "sacremoses",
], ],

View File

@@ -152,7 +152,7 @@ from .pipelines import (
from .retrieval_rag import RagRetriever from .retrieval_rag import RagRetriever
# Tokenizers # Tokenizers
from .tokenization_albert import AlbertTokenizer from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
from .tokenization_bart import BartTokenizer, BartTokenizerFast from .tokenization_bart import BartTokenizer, BartTokenizerFast
from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
@@ -160,7 +160,7 @@ from .tokenization_bert_generation import BertGenerationTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
from .tokenization_bertweet import BertweetTokenizer from .tokenization_bertweet import BertweetTokenizer
from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer
from .tokenization_camembert import CamembertTokenizer from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from .tokenization_ctrl import CTRLTokenizer from .tokenization_ctrl import CTRLTokenizer
from .tokenization_deberta import DebertaTokenizer from .tokenization_deberta import DebertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
@@ -180,18 +180,18 @@ from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .tokenization_mbart import MBartTokenizer from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .tokenization_phobert import PhobertTokenizer from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from .tokenization_t5 import T5Tokenizer from .tokenization_t5 import T5Tokenizer, T5TokenizerFast
from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import ( from .tokenization_utils_base import (
BatchEncoding, BatchEncoding,
@@ -203,8 +203,8 @@ from .tokenization_utils_base import (
) )
from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
# Trainer # Trainer
from .trainer_callback import ( from .trainer_callback import (

View File

@@ -0,0 +1,566 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Utilities to convert slow tokenizers into their fast tokenizer counterparts.
All the conversions are grouped here to gather SentencePiece dependencies outside of
the fast tokenizers files and allow us to make our dependency on SentencePiece optional.
"""
from typing import Dict, List, Tuple
from sentencepiece import SentencePieceProcessor
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece
# from transformers.tokenization_openai import OpenAIGPTTokenizer
from transformers.utils import sentencepiece_model_pb2 as model
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models.
    https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        """Load the SentencePiece model stored at the path ``model``."""
        # Get SentencePiece
        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:
        """
        Rebuild a vocabulary and a merge list from the loaded model.

        Returns:
            A ``(vocab, merges)`` pair. ``vocab`` maps every piece to its id. ``merges`` lists each
            ``(left, right)`` pair of pieces whose concatenation is itself a piece, ordered by the id of
            the concatenated piece and then by the position of ``left`` in the vocabulary.
        """
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
        # Position of every piece in the iteration order of ``vocab``; used to break ties between
        # several splits of the same piece exactly like a nested scan over the vocabulary would.
        position = {piece: rank for rank, piece in enumerate(vocab)}

        # Merges: instead of concatenating every pair of pieces (quadratic in the vocabulary size),
        # split each piece at every offset and keep the splits whose two halves are both pieces.
        candidates = []
        for merged, piece_id in vocab.items():
            # A falsy id (0) is skipped: this keeps the output identical to the historical
            # truthiness check on the id of the merged piece.
            if not piece_id:
                continue
            for cut in range(len(merged) + 1):
                piece_l, piece_r = merged[:cut], merged[cut:]
                if piece_l in vocab and piece_r in vocab:
                    candidates.append((piece_id, position[piece_l], piece_l, piece_r))

        candidates.sort(key=lambda item: item[:2])
        merges = [(piece_l, piece_r) for _, _, piece_l, piece_r in candidates]
        return vocab, merges
def check_number_comma(piece: str) -> bool:
    """
    Tell whether ``piece`` is anything other than "a digit directly followed by a final comma".

    Returns:
        ``False`` only when the piece has at least two characters, ends with ``","`` and the character
        right before that comma is a digit; ``True`` in every other case.
    """
    if len(piece) < 2:
        return True
    before_last, last = piece[-2], piece[-1]
    if last != ",":
        return True
    return not before_last.isdigit()
def get_proto(filename: str):
    """
    Parse the serialized model stored in ``filename`` into a ``ModelProto`` message.

    Args:
        filename: Path of the file to read; it is opened in binary mode.

    Returns:
        The ``model.ModelProto`` instance populated from the file content.
    """
    m = model.ModelProto()
    # Use a context manager so the file handle is closed even if parsing fails.
    with open(filename, "rb") as f:
        m.ParseFromString(f.read())
    return m
class Converter:
    """Base class of the converters: it stores the tokenizer to convert and defines the entry point."""

    def __init__(self, original_tokenizer):
        """Keep a reference to ``original_tokenizer`` for use by ``converted``."""
        self.original_tokenizer = original_tokenizer

    def converted(self) -> Tokenizer:
        """Return the converted tokenizer. The base class always raises ``NotImplementedError``."""
        raise NotImplementedError()
class BertConverter(Converter):
    """Converter that assembles a WordPiece tokenizer with BERT normalization and pre-tokenization."""

    def converted(self) -> Tokenizer:
        """
        Build the tokenizer from the vocabulary and settings of the original tokenizer.

        Returns:
            A ``Tokenizer`` with a WordPiece model, a ``BertNormalizer``, a ``BertPreTokenizer``, a template
            post-processor adding the cls/sep tokens and a WordPiece decoder.
        """
        slow = self.original_tokenizer
        tokenizer = Tokenizer(WordPiece(slow.vocab, unk_token=str(slow.unk_token)))
        # NOTE: the special tokens are not registered on the backend tokenizer here.

        # Normalization switches stay off unless the original tokenizer exposes a ``basic_tokenizer``.
        normalizer_options = {"handle_chinese_chars": False, "strip_accents": False, "lowercase": False}
        if hasattr(slow, "basic_tokenizer"):
            basic = slow.basic_tokenizer
            normalizer_options = {
                "handle_chinese_chars": basic.tokenize_chinese_chars,
                "strip_accents": basic.strip_accents,
                "lowercase": basic.do_lower_case,
            }
        tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True, **normalizer_options)
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls_token = str(slow.cls_token)
        sep_token = str(slow.sep_token)
        special_tokens = [
            (cls_token, slow.cls_token_id),
            (sep_token, slow.sep_token_id),
        ]
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls_token}:0 $A:0 {sep_token}:0",
            pair=f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
            special_tokens=special_tokens,
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")
        return tokenizer
class FunnelConverter(Converter):
    """Converter that assembles a WordPiece tokenizer whose cls token gets token type id 2."""

    def converted(self) -> Tokenizer:
        """
        Build the tokenizer from the vocabulary and settings of the original tokenizer.

        Returns:
            A ``Tokenizer`` with a WordPiece model, a ``BertNormalizer``, a ``BertPreTokenizer``, a template
            post-processor adding the cls/sep tokens and a WordPiece decoder.
        """
        slow = self.original_tokenizer
        tokenizer = Tokenizer(WordPiece(slow.vocab, unk_token=str(slow.unk_token)))
        # NOTE: the special tokens are not registered on the backend tokenizer here.

        # Normalization switches stay off unless the original tokenizer exposes a ``basic_tokenizer``.
        normalizer_options = {"handle_chinese_chars": False, "strip_accents": False, "lowercase": False}
        if hasattr(slow, "basic_tokenizer"):
            basic = slow.basic_tokenizer
            normalizer_options = {
                "handle_chinese_chars": basic.tokenize_chinese_chars,
                "strip_accents": basic.strip_accents,
                "lowercase": basic.do_lower_case,
            }
        tokenizer.normalizer = normalizers.BertNormalizer(clean_text=True, **normalizer_options)
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls_token = str(slow.cls_token)
        sep_token = str(slow.sep_token)
        special_tokens = [
            (cls_token, slow.cls_token_id),
            (sep_token, slow.sep_token_id),
        ]
        tokenizer.post_processor = processors.TemplateProcessing(
            # token_type_id is 2 for Funnel transformer
            single=f"{cls_token}:2 $A:0 {sep_token}:0",
            pair=f"{cls_token}:2 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
            special_tokens=special_tokens,
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")
        return tokenizer
class OpenAIGPTConverter(Converter):
    """Converter that assembles a BPE tokenizer using the ``</w>`` end-of-word suffix."""

    def converted(self) -> Tokenizer:
        """
        Build the tokenizer from the encoder and BPE ranks of the original tokenizer.

        Returns:
            A ``Tokenizer`` with a BPE model, a lowercasing ``BertNormalizer``, a ``BertPreTokenizer`` and
            a ``BPEDecoder``. The unknown token is added as a special token when the model knows it.
        """
        slow = self.original_tokenizer
        vocab = slow.encoder
        merges = list(slow.bpe_ranks.keys())
        unk = str(slow.unk_token)

        bpe_model = BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            unk_token=unk,
            end_of_word_suffix="</w>",
            fuse_unk=False,
        )
        tokenizer = Tokenizer(bpe_model)

        # Only register the unknown token as special when it has an id in the model.
        if tokenizer.token_to_id(unk) is not None:
            tokenizer.add_special_tokens([unk])

        tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")
        return tokenizer
class GPT2Converter(Converter):
    """Converter that assembles a byte-level BPE tokenizer."""

    def converted(self) -> Tokenizer:
        """
        Build the tokenizer from the encoder and BPE ranks of the original tokenizer.

        Returns:
            A ``Tokenizer`` with a BPE model and byte-level pre-tokenizer, decoder and post-processor.
        """
        slow = self.original_tokenizer
        bpe_model = BPE(
            vocab=slow.encoder,
            merges=list(slow.bpe_ranks.keys()),
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
        tokenizer = Tokenizer(bpe_model)

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=slow.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
        return tokenizer
class RobertaConverter(Converter):
    """Converter that assembles a byte-level BPE tokenizer with RoBERTa post-processing."""

    def converted(self) -> Tokenizer:
        """
        Build the tokenizer from the encoder and BPE ranks of the original tokenizer.

        Returns:
            A ``Tokenizer`` with a BPE model, byte-level pre-tokenizer and decoder, and a
            ``RobertaProcessing`` post-processor built from the sep/cls tokens and their ids.
        """
        slow = self.original_tokenizer
        bpe_model = BPE(
            vocab=slow.encoder,
            merges=list(slow.bpe_ranks.keys()),
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        )
        tokenizer = Tokenizer(bpe_model)

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=slow.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()

        sep_entry = (slow.sep_token, slow.sep_token_id)
        cls_entry = (slow.cls_token, slow.cls_token_id)
        tokenizer.post_processor = processors.RobertaProcessing(
            sep=sep_entry,
            cls=cls_entry,
            add_prefix_space=slow.add_prefix_space,
            trim_offsets=True,  # True by default on Roberta (historical)
        )
        return tokenizer
class SpmConverter(Converter):
    """
    Base converter for tokenizers backed by a SentencePiece model file.

    The model file referenced by ``original_tokenizer.vocab_file`` is parsed once at construction and kept
    in ``self.proto``. Subclasses customize the result by overriding ``vocab``, ``unk_id``, ``normalizer``
    and ``post_processor``.
    """

    def __init__(self, *args):
        """Store the original tokenizer (see ``Converter``) and parse its SentencePiece model file."""
        super().__init__(*args)
        self.proto = get_proto(self.original_tokenizer.vocab_file)

    def vocab(self, proto):
        """Return the ``(piece, score)`` pairs of ``proto`` in their original order."""
        return [(piece.piece, piece.score) for piece in proto.pieces]

    def unk_id(self, proto):
        """Return the unknown-token id recorded in the trainer spec of ``proto``."""
        return proto.trainer_spec.unk_id

    def tokenizer(self, proto):
        """
        Create the bare ``Tokenizer`` (model only) matching the model type stored in ``proto``.

        Model type ``1`` builds a ``Unigram`` model from ``vocab``/``unk_id``; model type ``2`` builds a
        ``BPE`` model whose vocabulary and merges are extracted from the model file.

        Raises:
            Exception: If the model type is neither ``1`` nor ``2``.
        """
        model_type = proto.trainer_spec.model_type
        vocab = self.vocab(proto)
        unk_id = self.unk_id(proto)
        if model_type == 1:
            tokenizer = Tokenizer(Unigram(vocab, unk_id))
        elif model_type == 2:
            vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    unk_token=proto.trainer_spec.unk_piece,
                    fuse_unk=True,
                )
            )
        else:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )
        return tokenizer

    def normalizer(self, proto):
        """Return a ``Precompiled`` normalizer built from the charsmap stored in ``proto``."""
        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
        return normalizers.Precompiled(precompiled_charsmap)

    def post_processor(self):
        """Return the post-processor to attach, or ``None`` (the default) to attach none."""
        return None

    def converted(self) -> Tokenizer:
        """
        Assemble the full tokenizer: model, normalizer, pre-tokenizer, decoder and post-processor.

        Returns:
            The assembled ``Tokenizer``. A post-processor is only set when ``post_processor()`` returns
            a truthy value.
        """
        tokenizer = self.tokenizer(self.proto)

        # Tokenizer assemble
        tokenizer.normalizer = self.normalizer(self.proto)

        # The Metaspace replacement must be a single character: use the SentencePiece
        # meta symbol (U+2581) that marks word boundaries. An empty string is not valid here.
        replacement = "▁"
        add_prefix_space = True
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        return tokenizer
class AlbertConverter(SpmConverter):
    """SentencePiece converter with a custom vocabulary scoring, normalizer and ``[CLS]``/``[SEP]`` template."""

    def vocab(self, proto):
        """Return ``(piece, score)`` pairs, lowering by 100 the score of pieces failing ``check_number_comma``."""
        entries = []
        for piece in proto.pieces:
            score = piece.score if check_number_comma(piece.piece) else piece.score - 100
            entries.append((piece.piece, score))
        return entries

    def normalizer(self, proto):
        """
        Build the normalizer sequence.

        Quote replacements always come first; accent stripping and lowercasing are added according to
        ``keep_accents`` and ``do_lower_case`` of the original tokenizer; the precompiled charsmap of
        ``proto`` is applied last.
        """
        slow = self.original_tokenizer
        steps = [normalizers.Replace("``", '"'), normalizers.Replace("''", '"')]
        if not slow.keep_accents:
            steps += [normalizers.NFKD(), normalizers.StripAccents()]
        if slow.do_lower_case:
            steps.append(normalizers.Lowercase())
        steps.append(normalizers.Precompiled(proto.normalizer_spec.precompiled_charsmap))
        return normalizers.Sequence(steps)

    def post_processor(self):
        """Return the template wrapping sequences as ``[CLS] A [SEP]`` / ``[CLS] A [SEP] B [SEP]``."""
        to_id = self.original_tokenizer.convert_tokens_to_ids
        special_tokens = [
            ("[CLS]", to_id("[CLS]")),
            ("[SEP]", to_id("[SEP]")),
        ]
        return processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
            special_tokens=special_tokens,
        )
class CamembertConverter(SpmConverter):
    """SentencePiece converter that prepends four fixed tokens and appends ``<mask>`` to the vocabulary."""

    def vocab(self, proto):
        """Return the fixed leading tokens, then every piece of ``proto``, then ``<mask>``."""
        leading = [
            ("<s>NOTUSED", 0.0),
            ("<pad>", 0.0),
            ("</s>NOTUSED", 0.0),
            ("<unk>", 0.0),
        ]
        pieces = []
        for index, piece in enumerate(proto.pieces):
            # We down-grade the first original SentencePiece entry by -100 to avoid using it
            # and use our added token instead
            score = piece.score - 100 if index == 0 else piece.score
            pieces.append((piece.piece, score))
        return leading + pieces + [("<mask>", 0.0)]

    def unk_id(self, proto):
        """Return 3, the index of ``<unk>`` in the list built by ``vocab``."""
        # See vocab unk position
        return 3

    def post_processor(self):
        """Return the template wrapping sequences as ``<s> A </s>`` / ``<s> A </s> </s> B </s>``."""
        to_id = self.original_tokenizer.convert_tokens_to_ids
        special_tokens = [
            ("<s>", to_id("<s>")),
            ("</s>", to_id("</s>")),
        ]
        return processors.TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> </s> $B </s>",
            special_tokens=special_tokens,
        )
class MBartConverter(SpmConverter):
    """SentencePiece converter that adds language-code tokens and ``<mask>`` after the model pieces."""

    def vocab(self, proto):
        """
        Return the vocabulary as ``(token, score)`` pairs.

        The list starts with four fixed tokens, continues with the pieces of ``proto`` from index 3 on,
        then the language codes (score 0.0) and finally ``<mask>``.
        """
        language_codes = (
            "ar_AR",
            "cs_CZ",
            "de_DE",
            "en_XX",
            "es_XX",
            "et_EE",
            "fi_FI",
            "fr_XX",
            "gu_IN",
            "hi_IN",
            "it_IT",
            "ja_XX",
            "kk_KZ",
            "ko_KR",
            "lt_LT",
            "lv_LV",
            "my_MM",
            "ne_NP",
            "nl_XX",
            "ro_RO",
            "ru_RU",
            "si_LK",
            "tr_TR",
            "vi_VN",
            "zh_CN",
        )
        vocab = [
            ("<s>", 0.0),
            ("<pad>", 0.0),
            ("</s>", 0.0),
            ("<unk>", 0.0),
        ]
        vocab.extend((piece.piece, piece.score) for piece in proto.pieces[3:])
        vocab.extend((code, 0.0) for code in language_codes)
        vocab.append(("<mask>", 0.0))
        return vocab

    def unk_id(self, proto):
        """Return 3, the index of ``<unk>`` in the list built by ``vocab``."""
        return 3

    def post_processor(self):
        """Return the template appending ``</s> en_XX`` after the sequence(s)."""
        to_id = self.original_tokenizer.convert_tokens_to_ids
        special_tokens = [
            ("en_XX", to_id("en_XX")),
            ("</s>", to_id("</s>")),
        ]
        return processors.TemplateProcessing(
            single="$A </s> en_XX",
            pair="$A $B </s> en_XX",
            special_tokens=special_tokens,
        )
class XLMRobertaConverter(SpmConverter):
    """SentencePiece converter that replaces the first three pieces by four fixed tokens and adds ``<mask>``."""

    def vocab(self, proto):
        """Return four fixed tokens, then the pieces of ``proto`` from index 3 on, then ``<mask>``."""
        leading = [
            ("<s>", 0.0),
            ("<pad>", 0.0),
            ("</s>", 0.0),
            ("<unk>", 0.0),
        ]
        pieces = [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return leading + pieces + [("<mask>", 0.0)]

    def unk_id(self, proto):
        """Return 3, the index of ``<unk>`` in the list built by ``vocab``."""
        return 3

    def post_processor(self):
        """Return the template wrapping sequences as ``<s> A </s>`` / ``<s> A </s> </s> B </s>``."""
        to_id = self.original_tokenizer.convert_tokens_to_ids
        special_tokens = [
            ("<s>", to_id("<s>")),
            ("</s>", to_id("</s>")),
        ]
        return processors.TemplateProcessing(
            single="<s> $A </s>",
            pair="<s> $A </s> </s> $B </s>",
            special_tokens=special_tokens,
        )
class XLNetConverter(SpmConverter):
    """SentencePiece converter with a custom vocabulary scoring, normalizer and ``<sep>``/``<cls>`` template."""

    def vocab(self, proto):
        """Return ``(piece, score)`` pairs, lowering by 100 the score of pieces failing ``check_number_comma``."""
        entries = []
        for piece in proto.pieces:
            score = piece.score if check_number_comma(piece.piece) else piece.score - 100
            entries.append((piece.piece, score))
        return entries

    def normalizer(self, proto):
        """
        Build the normalizer sequence.

        Quote replacements always come first; accent stripping and lowercasing are added according to
        ``keep_accents`` and ``do_lower_case`` of the original tokenizer; the precompiled charsmap of
        ``proto`` is applied last.
        """
        slow = self.original_tokenizer
        steps = [normalizers.Replace("``", '"'), normalizers.Replace("''", '"')]
        if not slow.keep_accents:
            steps += [normalizers.NFKD(), normalizers.StripAccents()]
        if slow.do_lower_case:
            steps.append(normalizers.Lowercase())
        steps.append(normalizers.Precompiled(proto.normalizer_spec.precompiled_charsmap))
        return normalizers.Sequence(steps)

    def post_processor(self):
        """Return the template appending ``<sep>`` after each sequence and a final ``<cls>`` with type id 2."""
        to_id = self.original_tokenizer.convert_tokens_to_ids
        special_tokens = [
            ("<sep>", to_id("<sep>")),
            ("<cls>", to_id("<cls>")),
        ]
        return processors.TemplateProcessing(
            single="$A:0 <sep>:0 <cls>:2",
            pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
            special_tokens=special_tokens,
        )
class ReformerConverter(SpmConverter):
    """SentencePiece converter that uses every ``SpmConverter`` default without any override."""
class BertGenerationConverter(SpmConverter):
    """SentencePiece converter that uses every ``SpmConverter`` default without any override."""
class PegasusConverter(SpmConverter):
    """SentencePiece converter that shifts the model pieces by the ``offset`` of the original tokenizer."""

    def vocab(self, proto):
        """
        Return the vocabulary as ``(token, score)`` pairs.

        The pad and eos tokens come first (score 0), followed by ``offset`` placeholder entries named
        ``unk_<i>`` (score -100) and then the pieces of ``proto`` from index 2 on.
        """
        slow = self.original_tokenizer
        vocab = [
            (slow.pad_token, 0),
            (slow.eos_token, 0),
        ]
        vocab.extend((f"unk_{i}", -100) for i in range(2, 2 + slow.offset))
        vocab.extend((piece.piece, piece.score) for piece in proto.pieces[2:])
        return vocab

    def unk_id(self, proto):
        """Return the unknown-token id of ``proto`` shifted by the tokenizer ``offset``."""
        shift = self.original_tokenizer.offset
        return proto.trainer_spec.unk_id + shift

    def post_processor(self):
        """Return the template appending the eos token after the sequence(s)."""
        slow = self.original_tokenizer
        eos = slow.eos_token
        return processors.TemplateProcessing(
            single=["$A", eos],
            pair=["$A", "$B", eos],
            special_tokens=[(eos, slow.eos_token_id)],
        )
class T5Converter(SpmConverter):
    """SentencePiece converter that appends the ``<extra_id_N>`` tokens and a ``</s>`` template."""

    def vocab(self, proto):
        """Return the pieces of ``proto`` followed by the extra-id tokens in decreasing index order."""
        extra_template = "<extra_id_{}>"
        extra_count = self.original_tokenizer._extra_ids
        vocab = [(piece.piece, piece.score) for piece in proto.pieces]
        # Highest index first: <extra_id_{count-1}> ... <extra_id_0>
        for index in reversed(range(extra_count)):
            vocab.append((extra_template.format(index), 0.0))
        return vocab

    def post_processor(self):
        """Return the template appending ``</s>`` after each sequence."""
        eos_id = self.original_tokenizer.convert_tokens_to_ids("</s>")
        return processors.TemplateProcessing(
            single=["$A", "</s>"],
            pair=["$A", "</s>", "$B", "</s>"],
            special_tokens=[("</s>", eos_id)],
        )
# Registry used by ``convert_slow_tokenizer``: maps the class name of a slow tokenizer
# to the converter class that builds its fast counterpart. Several tokenizer classes
# share one converter (e.g. the BERT-like ones all map to ``BertConverter``).
CONVERTERS = {
"AlbertTokenizer": AlbertConverter,
"BertTokenizer": BertConverter,
"BertGenerationTokenizer": BertGenerationConverter,
"BartTokenizer": RobertaConverter,
"CamembertTokenizer": CamembertConverter,
"DistilBertTokenizer": BertConverter,
"DPRReaderTokenizer": BertConverter,
"DPRQuestionEncoderTokenizer": BertConverter,
"DPRContextEncoderTokenizer": BertConverter,
"FunnelTokenizer": FunnelConverter,
"GPT2Tokenizer": GPT2Converter,
"LxmertTokenizer": BertConverter,
"MBartTokenizer": MBartConverter,
"OpenAIGPTTokenizer": OpenAIGPTConverter,
"PegasusTokenizer": PegasusConverter,
"ReformerTokenizer": ReformerConverter,
"RobertaTokenizer": RobertaConverter,
"T5Tokenizer": T5Converter,
"XLMRobertaTokenizer": XLMRobertaConverter,
"XLNetTokenizer": XLNetConverter,
}
def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
    """
    Convert a slow tokenizer instance into a ``Tokenizer`` using the converter registered for its class.

    Args:
        transformer_tokenizer: The slow tokenizer instance; its class name selects the converter.

    Returns:
        The ``Tokenizer`` returned by the ``converted()`` method of the selected converter.

    Raises:
        KeyError: If the class name of ``transformer_tokenizer`` has no entry in ``CONVERTERS``.
    """
    slow_class_name = transformer_tokenizer.__class__.__name__
    converter = CONVERTERS[slow_class_name](transformer_tokenizer)
    return converter.converted()

View File

@@ -21,6 +21,7 @@ from shutil import copyfile
from typing import List, Optional from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -340,3 +341,206 @@ class AlbertTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,) return (out_vocab_file,)
class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on
    `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to keep accents when tokenizing.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = AlbertTokenizer

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        # Forward every option to the parent class, then keep the ALBERT-specific ones on the instance.
        super().__init__(
            vocab_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating them and adding the
        special tokens. An ALBERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        with_specials = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            with_specials = with_specials + token_ids_1 + sep
        return with_specials

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Compute the special-tokens mask of a sequence or a pair of sequences.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If :obj:`already_has_special_tokens` is set and :obj:`token_ids_1` is also given.
        """
        if not already_has_special_tokens:
            # Positions of the added tokens: [CLS] A [SEP] (B [SEP])
            mask = [1] + ([0] * len(token_ids_0)) + [1]
            if token_ids_1 is None:
                return mask
            return mask + ([0] * len(token_ids_1)) + [1]

        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formatted with special tokens for the model."
            )
        return [1 if token_id in [self.sep_token_id, self.cls_token_id] else 0 for token_id in token_ids_0]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs of a sequence or a pair of sequences. An ALBERT sequence pair mask has
        the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        # First segment covers [CLS] A [SEP]; the optional second one covers B [SEP].
        type_ids = [0] * len(cls + token_ids_0 + sep)
        if token_ids_1 is not None:
            type_ids += [1] * len(token_ids_1 + sep)
        return type_ids

    def save_vocabulary(self, save_directory):
        """
        Copy the vocabulary file of this tokenizer into a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved, or :obj:`None` (after logging an error) when
            :obj:`save_directory` is not a directory.
        """
        if os.path.isdir(save_directory):
            out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
            # Skip the copy when source and destination are the same file.
            if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
                copyfile(self.vocab_file, out_vocab_file)
            return (out_vocab_file,)

        logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
        return None

View File

@@ -56,14 +56,14 @@ from .configuration_auto import (
replace_list_option_in_docstrings, replace_list_option_in_docstrings,
) )
from .configuration_utils import PretrainedConfig from .configuration_utils import PretrainedConfig
from .tokenization_albert import AlbertTokenizer from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .tokenization_bart import BartTokenizer, BartTokenizerFast from .tokenization_bart import BartTokenizer, BartTokenizerFast
from .tokenization_bert import BertTokenizer, BertTokenizerFast from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert_generation import BertGenerationTokenizer from .tokenization_bert_generation import BertGenerationTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_bertweet import BertweetTokenizer from .tokenization_bertweet import BertweetTokenizer
from .tokenization_blenderbot import BlenderbotSmallTokenizer from .tokenization_blenderbot import BlenderbotSmallTokenizer
from .tokenization_camembert import CamembertTokenizer from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from .tokenization_ctrl import CTRLTokenizer from .tokenization_ctrl import CTRLTokenizer
from .tokenization_deberta import DebertaTokenizer from .tokenization_deberta import DebertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
@@ -77,21 +77,21 @@ from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast
from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .tokenization_marian import MarianTokenizer from .tokenization_marian import MarianTokenizer
from .tokenization_mbart import MBartTokenizer from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_pegasus import PegasusTokenizer from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .tokenization_phobert import PhobertTokenizer from .tokenization_phobert import PhobertTokenizer
from .tokenization_rag import RagTokenizer from .tokenization_rag import RagTokenizer
from .tokenization_reformer import ReformerTokenizer from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from .tokenization_t5 import T5Tokenizer from .tokenization_t5 import T5Tokenizer, T5TokenizerFast
from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLTokenizerFast from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlnet import XLNetTokenizer, XLNetTokenizerFast
from .utils import logging from .utils import logging
@@ -101,14 +101,14 @@ logger = logging.get_logger(__name__)
TOKENIZER_MAPPING = OrderedDict( TOKENIZER_MAPPING = OrderedDict(
[ [
(RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)), (RetriBertConfig, (RetriBertTokenizer, RetriBertTokenizerFast)),
(T5Config, (T5Tokenizer, None)), (T5Config, (T5Tokenizer, T5TokenizerFast)),
(MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)), (MobileBertConfig, (MobileBertTokenizer, MobileBertTokenizerFast)),
(DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)), (DistilBertConfig, (DistilBertTokenizer, DistilBertTokenizerFast)),
(AlbertConfig, (AlbertTokenizer, None)), (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
(CamembertConfig, (CamembertTokenizer, None)), (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)),
(PegasusConfig, (PegasusTokenizer, None)), (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)),
(MBartConfig, (MBartTokenizer, None)), (MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
(XLMRobertaConfig, (XLMRobertaTokenizer, None)), (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
(MarianConfig, (MarianTokenizer, None)), (MarianConfig, (MarianTokenizer, None)),
(BlenderbotConfig, (BlenderbotSmallTokenizer, None)), (BlenderbotConfig, (BlenderbotSmallTokenizer, None)),
(LongformerConfig, (LongformerTokenizer, None)), (LongformerConfig, (LongformerTokenizer, None)),
@@ -117,7 +117,7 @@ TOKENIZER_MAPPING = OrderedDict(
(RobertaConfig, (BertweetTokenizer, None)), (RobertaConfig, (BertweetTokenizer, None)),
(RobertaConfig, (PhobertTokenizer, None)), (RobertaConfig, (PhobertTokenizer, None)),
(RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
(ReformerConfig, (ReformerTokenizer, None)), (ReformerConfig, (ReformerTokenizer, ReformerTokenizerFast)),
(ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)),
(FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)), (FunnelConfig, (FunnelTokenizer, FunnelTokenizerFast)),
(LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)), (LxmertConfig, (LxmertTokenizer, LxmertTokenizerFast)),
@@ -127,15 +127,14 @@ TOKENIZER_MAPPING = OrderedDict(
(BertConfig, (BertTokenizer, BertTokenizerFast)), (BertConfig, (BertTokenizer, BertTokenizerFast)),
(OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)),
(GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)),
(TransfoXLConfig, (TransfoXLTokenizer, TransfoXLTokenizerFast)), (TransfoXLConfig, (TransfoXLTokenizer, None)),
(XLNetConfig, (XLNetTokenizer, None)), (XLNetConfig, (XLNetTokenizer, XLNetTokenizerFast)),
(FlaubertConfig, (FlaubertTokenizer, None)), (FlaubertConfig, (FlaubertTokenizer, None)),
(XLMConfig, (XLMTokenizer, None)), (XLMConfig, (XLMTokenizer, None)),
(CTRLConfig, (CTRLTokenizer, None)), (CTRLConfig, (CTRLTokenizer, None)),
(FSMTConfig, (FSMTTokenizer, None)), (FSMTConfig, (FSMTTokenizer, None)),
(BertGenerationConfig, (BertGenerationTokenizer, None)), (BertGenerationConfig, (BertGenerationTokenizer, None)),
(DebertaConfig, (DebertaTokenizer, None)), (DebertaConfig, (DebertaTokenizer, None)),
(LayoutLMConfig, (LayoutLMTokenizer, None)),
(RagConfig, (RagTokenizer, None)), (RagConfig, (RagTokenizer, None)),
] ]
) )

View File

@@ -163,6 +163,7 @@ class BartTokenizerFast(RobertaTokenizerFast):
"vocab_file": {m: vocab_url for m in _all_bart_models}, "vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models}, "merges_file": {m: merges_url for m in _all_bart_models},
} }
slow_tokenizer_class = BartTokenizer
def prepare_seq2seq_batch( def prepare_seq2seq_batch(
self, self,

View File

@@ -20,8 +20,6 @@ import os
import unicodedata import unicodedata
from typing import List, Optional from typing import List, Optional
from tokenizers import BertWordPieceTokenizer
from .tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from .tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -206,6 +204,10 @@ class BertTokenizer(PreTrainedTokenizer):
) )
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
def do_lower_case(self):
    """Lower-casing flag, read from the ``basic_tokenizer`` held by this instance."""
    word_splitter = self.basic_tokenizer
    return word_splitter.do_lower_case
@property @property
def vocab_size(self): def vocab_size(self):
return len(self.vocab) return len(self.vocab)
@@ -329,7 +331,7 @@ class BertTokenizer(PreTrainedTokenizer):
def save_vocabulary(self, vocab_path): def save_vocabulary(self, vocab_path):
""" """
Save the vocabulary (copy original file) and special tokens file to a directory. Save the vocabulary and special tokens file to a directory.
Args: Args:
vocab_path (:obj:`str`): vocab_path (:obj:`str`):
@@ -610,6 +612,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = BertTokenizer
def __init__( def __init__(
self, self,
@@ -620,31 +623,20 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
pad_token="[PAD]", pad_token="[PAD]",
cls_token="[CLS]", cls_token="[CLS]",
mask_token="[MASK]", mask_token="[MASK]",
clean_text=True,
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
wordpieces_prefix="##",
**kwargs **kwargs
): ):
super().__init__( super().__init__(
BertWordPieceTokenizer( vocab_file,
vocab_file=vocab_file, do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
clean_text=clean_text,
handle_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
lowercase=do_lower_case,
wordpieces_prefix=wordpieces_prefix,
),
unk_token=unk_token, unk_token=unk_token,
sep_token=sep_token, sep_token=sep_token,
pad_token=pad_token, pad_token=pad_token,
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs, **kwargs,
) )

View File

@@ -16,6 +16,7 @@
import collections import collections
import copy
import os import os
import unicodedata import unicodedata
from typing import Optional from typing import Optional
@@ -116,6 +117,13 @@ class BertJapaneseTokenizer(BertTokenizer):
pad_token=pad_token, pad_token=pad_token,
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
do_lower_case=do_lower_case,
do_word_tokenize=do_word_tokenize,
do_subword_tokenize=do_subword_tokenize,
word_tokenizer_type=word_tokenizer_type,
subword_tokenizer_type=subword_tokenizer_type,
never_split=never_split,
mecab_kwargs=mecab_kwargs,
**kwargs, **kwargs,
) )
# ^^ We call the grandparent's init, not the parent's. # ^^ We call the grandparent's init, not the parent's.
@@ -129,6 +137,10 @@ class BertJapaneseTokenizer(BertTokenizer):
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_word_tokenize = do_word_tokenize self.do_word_tokenize = do_word_tokenize
self.word_tokenizer_type = word_tokenizer_type
self.lower_case = do_lower_case
self.never_split = never_split
self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
if do_word_tokenize: if do_word_tokenize:
if word_tokenizer_type == "basic": if word_tokenizer_type == "basic":
self.word_tokenizer = BasicTokenizer( self.word_tokenizer = BasicTokenizer(
@@ -142,6 +154,7 @@ class BertJapaneseTokenizer(BertTokenizer):
raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
self.do_subword_tokenize = do_subword_tokenize self.do_subword_tokenize = do_subword_tokenize
self.subword_tokenizer_type = subword_tokenizer_type
if do_subword_tokenize: if do_subword_tokenize:
if subword_tokenizer_type == "wordpiece": if subword_tokenizer_type == "wordpiece":
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@@ -150,6 +163,23 @@ class BertJapaneseTokenizer(BertTokenizer):
else: else:
raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
@property
def do_lower_case(self):
    """Lower-casing flag, stored on the instance under the ``lower_case`` attribute."""
    flag = self.lower_case
    return flag
def __getstate__(self):
    """
    Return the instance state used for pickling.

    The state is a shallow copy of ``self.__dict__``. When the word tokenizer type is ``"mecab"``, the
    ``word_tokenizer`` entry is left out of the copy (presumably because the MeCab-backed object cannot be
    pickled); ``__setstate__`` builds a new one when the instance is restored.

    Returns:
        :obj:`dict`: The picklable attributes of this tokenizer. The live ``__dict__`` is not modified.
    """
    state = dict(self.__dict__)
    if self.word_tokenizer_type == "mecab":
        # ``word_tokenizer`` is only created when word tokenization is enabled, so tolerate its absence
        # instead of raising ``KeyError`` while pickling.
        state.pop("word_tokenizer", None)
    return state
def __setstate__(self, state):
self.__dict__ = state
if self.word_tokenizer_type == "mecab":
self.word_tokenizer = MecabTokenizer(
do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
)
def _tokenize(self, text): def _tokenize(self, text):
if self.do_word_tokenize: if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)

View File

@@ -129,7 +129,6 @@ class BertweetTokenizer(PreTrainedTokenizer):
**kwargs **kwargs
): ):
super().__init__( super().__init__(
max_len=128,
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, unk_token=unk_token,

View File

@@ -22,6 +22,7 @@ from typing import List, Optional
import sentencepiece as spm import sentencepiece as spm
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -36,7 +37,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
} }
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"camembert-base": None, "camembert-base": 512,
} }
SHARED_MODEL_IDENTIFIERS = [ SHARED_MODEL_IDENTIFIERS = [
@@ -118,7 +119,6 @@ class CamembertTokenizer(PreTrainedTokenizer):
**kwargs **kwargs
): ):
super().__init__( super().__init__(
max_len=512,
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, unk_token=unk_token,
@@ -223,6 +223,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
def vocab_size(self): def vocab_size(self):
return len(self.fairseq_tokens_to_ids) + len(self.sp_model) return len(self.fairseq_tokens_to_ids) + len(self.sp_model)
def get_vocab(self):
    """
    Return the vocabulary as a dictionary mapping token to index.

    Every index below ``vocab_size`` is converted to its token first; the entries of ``added_tokens_encoder``
    are then merged in, overriding any token that is already present.

    Returns:
        :obj:`Dict[str, int]`: The vocabulary.
    """
    token_to_index = {}
    for index in range(self.vocab_size):
        token_to_index[self.convert_ids_to_tokens(index)] = index
    token_to_index.update(self.added_tokens_encoder)
    return token_to_index
def _tokenize(self, text): def _tokenize(self, text):
return self.sp_model.EncodeAsPieces(text) return self.sp_model.EncodeAsPieces(text)
@@ -284,3 +289,189 @@ class CamembertTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,) return (out_vocab_file,)
class CamembertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
    <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.

    Attributes:
        vocab_file (:obj:`str`):
            Path of the SentencePiece file given at construction; :meth:`save_vocabulary` copies this file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]
    slow_tokenizer_class = CamembertTokenizer

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
        **kwargs
    ):
        # The default above is a single list object shared by every call. Hand the base class a private copy
        # so that no instance can end up holding (and mutating) the shared default or the caller's own list.
        if isinstance(additional_special_tokens, list):
            additional_special_tokens = list(additional_special_tokens)

        super().__init__(
            vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        # Remembered so that save_vocabulary() can copy the original SentencePiece file.
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A CamemBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        # Pairs are joined by two consecutive separators.
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If :obj:`token_ids_1` is given together with ``already_has_special_tokens=True``.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Only the separator and classifier ids are flagged in an already formatted sequence.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        # Mirrors the ``<s> A </s></s> B </s>`` layout of build_inputs_with_special_tokens.
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
        CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros, one per token of the sequence(s) once special tokens are added.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy of the original file) to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved. If ``save_directory`` is not an existing directory, an
            error is logged and :obj:`None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the source already is the destination file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

View File

@@ -87,3 +87,4 @@ class DistilBertTokenizerFast(BertTokenizerFast):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]
slow_tokenizer_class = DistilBertTokenizer

View File

@@ -98,6 +98,7 @@ class DPRContextEncoderTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = DPRContextEncoderTokenizer
class DPRQuestionEncoderTokenizer(BertTokenizer): class DPRQuestionEncoderTokenizer(BertTokenizer):
@@ -132,6 +133,7 @@ class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = DPRQuestionEncoderTokenizer
DPRSpanPrediction = collections.namedtuple( DPRSpanPrediction = collections.namedtuple(
@@ -417,3 +419,4 @@ class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]
slow_tokenizer_class = DPRReaderTokenizer

View File

@@ -80,3 +80,4 @@ class ElectraTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = ElectraTokenizer

View File

@@ -181,6 +181,7 @@ class FSMTTokenizer(PreTrainedTokenizer):
**kwargs **kwargs
): ):
super().__init__( super().__init__(
langs=langs,
unk_token=unk_token, unk_token=unk_token,
bos_token=bos_token, bos_token=bos_token,
sep_token=sep_token, sep_token=sep_token,

View File

@@ -152,6 +152,7 @@ class FunnelTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = FunnelTokenizer
cls_token_type_id: int = 2 cls_token_type_id: int = 2
def __init__( def __init__(
@@ -217,16 +218,3 @@ class FunnelTokenizerFast(BertTokenizerFast):
if token_ids_1 is None: if token_ids_1 is None:
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def _convert_encoding(self, encoding, **kwargs):
    """
    Convert an encoding through the parent implementation, then fix the token type id of the ``<cls>`` token.

    Every position whose input id equals ``self.cls_token_id`` gets ``self.cls_token_type_id`` as its token
    type id; all other token type ids are kept. Nothing is changed when the converted output has no
    ``"token_type_ids"`` entry.

    Args:
        encoding: The encoding to convert; passed unchanged to the parent method.
        **kwargs: Extra keyword arguments forwarded to the parent method.

    Returns:
        The dictionary returned by the parent method, with ``"token_type_ids"`` replaced when present.
    """

    def with_cls_type_id(input_ids, type_ids):
        # The <cls> token is located by id rather than by position: with left padding it is not
        # guaranteed to be the first token.
        return [
            self.cls_token_type_id if token_id == self.cls_token_id else type_id
            for token_id, type_id in zip(input_ids, type_ids)
        ]

    converted = super()._convert_encoding(encoding, **kwargs)
    if "token_type_ids" not in converted:
        return converted

    id_rows = converted["input_ids"]
    type_rows = converted["token_type_ids"]
    converted["token_type_ids"] = [with_cls_type_id(ids, types) for ids, types in zip(id_rows, type_rows)]
    return converted

View File

@@ -21,7 +21,6 @@ import warnings
from functools import lru_cache from functools import lru_cache
import regex as re import regex as re
from tokenizers import ByteLevelBPETokenizer
from .tokenization_utils import AddedToken, PreTrainedTokenizer from .tokenization_utils import AddedToken, PreTrainedTokenizer
from .tokenization_utils_base import BatchEncoding from .tokenization_utils_base import BatchEncoding
@@ -360,6 +359,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__( def __init__(
self, self,
@@ -369,19 +369,15 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
bos_token="<|endoftext|>", bos_token="<|endoftext|>",
eos_token="<|endoftext|>", eos_token="<|endoftext|>",
add_prefix_space=False, add_prefix_space=False,
trim_offsets=True,
**kwargs **kwargs
): ):
super().__init__( super().__init__(
ByteLevelBPETokenizer( vocab_file,
vocab_file=vocab_file, merges_file,
merges_file=merges_file, unk_token=unk_token,
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
),
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, add_prefix_space=add_prefix_space,
**kwargs, **kwargs,
) )
self.add_prefix_space = add_prefix_space self.add_prefix_space = add_prefix_space
@@ -409,8 +405,9 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
FutureWarning, FutureWarning,
) )
is_split_into_words = kwargs.pop("is_pretokenized") is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, ( assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs." "to use it with pretokenized inputs."

View File

@@ -69,3 +69,4 @@ class LongformerTokenizerFast(RobertaTokenizerFast):
"vocab_file": {m: vocab_url for m in _all_longformer_models}, "vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models},
} }
slow_tokenizer_class = LongformerTokenizer

View File

@@ -79,3 +79,4 @@ class LxmertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer

View File

@@ -15,10 +15,12 @@
from typing import List, Optional from typing import List, Optional
from tokenizers import processors
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding from .tokenization_utils import BatchEncoding
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .tokenization_xlm_roberta import XLMRobertaTokenizer from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .utils import logging from .utils import logging
@@ -109,6 +111,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
self._additional_special_tokens = list(self.lang_code_to_id.keys()) self._additional_special_tokens = list(self.lang_code_to_id.keys())
self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX"))
@property
def vocab_size(self):
    """
    Total number of ids: sentencepiece pieces, language codes, ``fairseq_offset`` and one slot for the mask
    token.
    """
    mask_token_slots = 1  # Plus 1 for the mask token
    piece_count = len(self.sp_model)
    language_code_count = len(self.lang_code_to_id)
    return piece_count + language_code_count + self.fairseq_offset + mask_token_slots
def get_special_tokens_mask( def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]: ) -> List[int]:
@@ -227,3 +233,185 @@ class MBartTokenizer(XLMRobertaTokenizer):
self.cur_lang_code = self.lang_code_to_id[lang] self.cur_lang_code = self.lang_code_to_id[lang]
self.prefix_tokens = [] self.prefix_tokens = []
self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
class MBartTokenizerFast(XLMRobertaTokenizerFast):
    """
    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
    a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.

    Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning
    the initialization parameters and other methods.

    .. warning::
        ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work
        properly.

    As produced by this tokenizer, both source and target language documents are laid out as
    ``<tokens> <eos> <language code>``: no prefix token is added and the language code follows the end-of-sequence
    token.

    Examples::

        >>> from transformers import MBartTokenizerFast
        >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro')
        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
        >>> batch: dict = tokenizer.prepare_seq2seq_batch(
        ...     example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian
        ... )
    """

    vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
    max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
    pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
    slow_tokenizer_class = MBartTokenizer

    # Ids added before / after every encoded sequence; rebound (never mutated in place) by the language setters.
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(self, *args, **kwargs):
        """Build the tokenizer, select the source language (``src_lang`` kwarg, default ``"en_XX"``) and register
        the fairseq language codes as additional special tokens."""
        super().__init__(*args, **kwargs)

        self.cur_lang_code = self.convert_tokens_to_ids("en_XX")
        self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX"))

        self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES})

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            :obj:`ValueError`: If ``token_ids_1`` is given together with ``already_has_special_tokens=True``.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only the separator and classifier ids are flagged in an already formatted sequence.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by concatenating them and wrapping the result in
        the current ``prefix_tokens`` and ``suffix_tokens``. Those depend on the last call to
        :meth:`set_src_lang_special_tokens` or :meth:`set_tgt_lang_special_tokens`, both of which configure
        ``X [eos, lang_code]``.

        BOS is never used.

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        truncation: bool = True,
        padding: str = "longest",
        return_tensors: str = "pt",
        **kwargs,
    ) -> BatchEncoding:
        # No docstring of its own on purpose: the decorator supplies the shared documentation.
        if max_length is None:
            max_length = self.max_len
        self.set_src_lang_special_tokens(src_lang)
        model_inputs: BatchEncoding = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs

        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        self.set_tgt_lang_special_tokens(tgt_lang)
        try:
            # NOTE: the targets are always truncated, independently of the ``truncation`` argument.
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=True,
                **kwargs,
            )["input_ids"]
        finally:
            # Go back to the source language settings even when encoding the targets fails, so the tokenizer
            # is never left configured for the target language.
            self.set_src_lang_special_tokens(src_lang)
        model_inputs["labels"] = labels
        return model_inputs

    def _set_lang_special_tokens(self, lang: str) -> None:
        """Make ``lang`` the current language and install the matching ``X [eos, lang_code]`` post-processor."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        # The template is expressed with token strings, each paired with its id in ``special_tokens``.
        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )

    def set_src_lang_special_tokens(self, src_lang: str) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
        self._set_lang_special_tokens(src_lang)

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, cur_lang_code]."""
        self._set_lang_special_tokens(lang)

View File

@@ -65,3 +65,4 @@ class MobileBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer

View File

@@ -19,8 +19,6 @@ import json
import os import os
import re import re
from tokenizers import CharBPETokenizer
from .tokenization_bert import BasicTokenizer from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_utils_fast import PreTrainedTokenizerFast
@@ -123,6 +121,10 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {} self.cache = {}
@property
def do_lower_case(self):
return True
@property @property
def vocab_size(self): def vocab_size(self):
return len(self.encoder) return len(self.encoder)
@@ -243,9 +245,8 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
the following peculiarities: the following peculiarities:
- lowercases all inputs, - lower case all inputs
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's - uses BERT's BasicTokenizer for pre-BPE tokenization
:obj:`BasicTokenizer` if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods. methods. Users should refer to this superclass for more information regarding those methods.
@@ -264,10 +265,11 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]
slow_tokenizer_class = OpenAIGPTTokenizer
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs): def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
kwargs.setdefault("unk_token", unk_token) super().__init__(vocab_file, merges_file, unk_token=unk_token, **kwargs)
super().__init__(
CharBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, unk_token=unk_token, lowercase=True), @property
**kwargs, def do_lower_case(self):
) return True

View File

@@ -15,10 +15,23 @@
from typing import Dict, List, Optional from typing import Dict, List, Optional
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .tokenization_reformer import ReformerTokenizer from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
SPIECE_UNDERLINE = ""
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/pegasus-xsum": 512,
}
class PegasusTokenizer(ReformerTokenizer): class PegasusTokenizer(ReformerTokenizer):
r""" r"""
Construct a Pegasus tokenizer. Construct a Pegasus tokenizer.
@@ -31,6 +44,8 @@ class PegasusTokenizer(ReformerTokenizer):
""" """
offset = 103 # entries 2-104 are only used for pretraining offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = {"vocab_file": "spiece.model"} vocab_files_names = {"vocab_file": "spiece.model"}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@@ -150,3 +165,85 @@ class PegasusTokenizer(ReformerTokenizer):
# for k, v in decoder_inputs.items(): # for k, v in decoder_inputs.items():
# model_inputs[f"decoder_{k}"] = v # model_inputs[f"decoder_{k}"] = v
return model_inputs return model_inputs
class PegasusTokenizerFast(ReformerTokenizerFast):
    """
    "Fast" Pegasus tokenizer built on :class:`~transformers.ReformerTokenizerFast`.

    Sequences are terminated with the end-of-sequence token and no beginning-of-sequence token is added (see
    :meth:`build_inputs_with_special_tokens`).
    """

    offset = 103  # entries 2-104 are only used for pretraining
    vocab_files_names = {"vocab_file": "spiece.model"}
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = PegasusTokenizer

    # def num_special_tokens_to_add(self, pair=False):
    #     """Just EOS"""
    #     return 1

    def _special_token_mask(self, seq):
        """Return a 0/1 list over ``seq`` with 1 at every special token id, ``<unk>`` excluded."""
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        # Sanity check: once <unk> is removed, only ids 0 and 1 are expected to be special.
        assert all_special_ids == {0, 1}
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            # The trailing 1 accounts for the eos token that will be appended.
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Build model inputs from a sequence by adding eos to the end. no bos token is added to the front.

        - single sequence: ``X </s>``
        - pair of sequences: ``A B </s>`` (not intended use)

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        return_tensors: str = "pt",
        truncation=True,
        padding="longest",
        **unused,
    ) -> BatchEncoding:
        # No docstring of its own on purpose: the decorator supplies the shared documentation.
        if "" in src_texts:
            raise ValueError(f"found empty string in src_texts: {src_texts}")
        # The same keyword arguments are used for sources and targets, except for ``max_length`` (see below).
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
        )
        model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs)
        if tgt_texts is None:
            return model_inputs
        if max_target_length is not None:
            tokenizer_kwargs["max_length"] = max_target_length
        # TODO(@sshleifer): maybe tgt_texts = [self.pad_token + t for t in tgt_texts]  # add decoder_start_token_id
        # Only the ids of the encoded targets are kept as labels.
        labels = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
        model_inputs["labels"] = labels
        # for k, v in decoder_inputs.items():
        #    model_inputs[f"decoder_{k}"] = v
        return model_inputs

View File

@@ -126,7 +126,6 @@ class PhobertTokenizer(PreTrainedTokenizer):
**kwargs **kwargs
): ):
super().__init__( super().__init__(
max_len=256,
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, unk_token=unk_token,

View File

@@ -19,6 +19,7 @@ import os
from shutil import copyfile from shutil import copyfile
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -184,3 +185,72 @@ class ReformerTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,) return (out_vocab_file,)
class ReformerTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
    <https://github.com/google/sentencepiece>`__ .

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer. Defaults to an empty list.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]
    slow_tokenizer_class = ReformerTokenizer

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        additional_special_tokens=None,
        **kwargs
    ):
        # Use a fresh list per call: a ``[]`` default would be one object shared by every instance that
        # relies on the default, so a mutation through one tokenizer would leak into the others.
        if additional_special_tokens is None:
            additional_special_tokens = []

        super().__init__(
            vocab_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        # Kept so that ``save_vocabulary`` can copy the original SentencePiece file.
        self.vocab_file = vocab_file

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy original file) to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved, or :obj:`None` (after logging an error) when
            ``save_directory`` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary already lives at the destination.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

View File

@@ -71,4 +71,5 @@ class RetriBertTokenizerFast(BertTokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = RetriBertTokenizer
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]

View File

@@ -17,8 +17,6 @@
import warnings import warnings
from typing import List, Optional from typing import List, Optional
from tokenizers.processors import RobertaProcessing
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_utils import AddedToken from .tokenization_utils import AddedToken
from .utils import logging from .utils import logging
@@ -344,6 +342,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"] model_input_names = ["attention_mask"]
slow_tokenizer_class = RobertaTokenizer
def __init__( def __init__(
self, self,
@@ -358,38 +357,23 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
add_prefix_space=False, add_prefix_space=False,
trim_offsets=True,
**kwargs **kwargs
): ):
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
kwargs.setdefault("pad_token", pad_token)
kwargs.setdefault("sep_token", sep_token)
kwargs.setdefault("cls_token", cls_token)
kwargs.setdefault("mask_token", mask_token)
super().__init__( super().__init__(
vocab_file=vocab_file, vocab_file,
merges_file=merges_file, merges_file,
unk_token=unk_token, errors=errors,
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
sep_token=sep_token,
cls_token=cls_token,
unk_token=unk_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space, add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
**kwargs, **kwargs,
) )
# This will add the necessary special tokens to the vocabulary if needed
self.sanitize_special_tokens()
self.backend_tokenizer._tokenizer.post_processor = RobertaProcessing(
sep=(sep_token, self.sep_token_id),
cls=(cls_token, self.cls_token_id),
add_prefix_space=add_prefix_space,
trim_offsets=trim_offsets,
)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
if token_ids_1 is None: if token_ids_1 is None:

View File

@@ -24,6 +24,7 @@ from typing import List, Optional
from .file_utils import add_start_docstrings from .file_utils import add_start_docstrings
from .tokenization_utils import BatchEncoding, PreTrainedTokenizer from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -322,3 +323,161 @@ class T5Tokenizer(PreTrainedTokenizer):
) )
model_inputs["labels"] = labels_and_decoder_mask["input_ids"] model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
return model_inputs return model_inputs
class T5TokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
    <https://github.com/google/sentencepiece>`__ .

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 100):
            Number of extra ids added to the end of the vocabulary for use as sentinels.
            These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
            Extra tokens are indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last
            token in the vocabulary like in T5 preprocessing see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]
    slow_tokenizer_class = T5Tokenizer

    # Ids placed in front of every sequence built by ``build_inputs_with_special_tokens``; rebound (never
    # mutated in place) by ``prepare_seq2seq_batch``.
    prefix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        # Kept so that ``save_vocabulary`` can copy the original SentencePiece file.
        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

    def save_vocabulary(self, save_directory):
        """
        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved, or :obj:`None` (after logging an error) when
            ``save_directory`` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary already lives at the destination.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens. The current ``prefix_tokens`` are placed in front of the result.
        A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        token_ids_0 = token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0
        else:
            token_ids_1 = token_ids_1 + [self.eos_token_id]
            return self.prefix_tokens + token_ids_0 + token_ids_1

    @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: Optional[str] = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # No docstring of its own on purpose: the decorator supplies the shared documentation.
        if max_length is None:
            max_length = self.max_len
        # Source texts get no prefix.
        self.prefix_tokens = []
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs

        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        # set prefix_tokens for target text
        self.prefix_tokens = [self.pad_token_id]
        try:
            labels_and_decoder_mask = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        finally:
            # Clear the target prefix even when encoding the targets fails, so later calls are not affected.
            self.prefix_tokens = []
        model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
        return model_inputs

View File

@@ -22,23 +22,15 @@ import glob
import os import os
import pickle import pickle
import re import re
import warnings
from collections import Counter, OrderedDict from collections import Counter, OrderedDict
from typing import List, Optional from typing import List
import numpy as np import numpy as np
import sacremoses as sm import sacremoses as sm
from tokenizers import Tokenizer
from tokenizers.implementations import BaseTokenizer
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str
from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit
from tokenizers.processors import BertProcessing
from .file_utils import cached_path, is_torch_available, torch_only_method from .file_utils import cached_path, is_torch_available, torch_only_method
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -53,7 +45,6 @@ VOCAB_FILES_NAMES = {
"pretrained_vocab_file_torch": "vocab.bin", "pretrained_vocab_file_torch": "vocab.bin",
"vocab_file": "vocab.txt", "vocab_file": "vocab.txt",
} }
VOCAB_FILES_NAMES_FAST = {"pretrained_vocab_file": "vocab.json", "vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = { PRETRAINED_VOCAB_FILES_MAP = {
"pretrained_vocab_file": { "pretrained_vocab_file": {
@@ -61,12 +52,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
} }
} }
PRETRAINED_VOCAB_FILES_MAP_FAST = {
"pretrained_vocab_file": {
"transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.json",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"transfo-xl-wt103": None, "transfo-xl-wt103": None,
} }
@@ -240,6 +225,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if vocab_file is not None: if vocab_file is not None:
self.build_vocab() self.build_vocab()
@property
def do_lower_case(self):
return self.lower_case
def _compile_space_around_punctuation_pattern(self): def _compile_space_around_punctuation_pattern(self):
look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols) look_ahead_for_special_token = "(?=[{}])".format(self.punctuation_symbols)
look_ahead_to_match_all_except_space = r"(?=[^\s])" look_ahead_to_match_all_except_space = r"(?=[^\s])"
@@ -299,11 +288,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
:obj:`Tuple(str)`: Paths to the files saved. :obj:`Tuple(str)`: Paths to the files saved.
""" """
logger.warning(
"Please note you will not be able to load the save vocabulary in"
" Rust-based TransfoXLTokenizerFast as they don't share the same structure."
)
if os.path.isdir(vocab_path): if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
else: else:
@@ -492,165 +476,6 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return symbols return symbols
class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
    """
    Word-level tokenizer pipeline for Transformer-XL assembled from `tokenizers` components: a ``WordLevel`` model,
    an optional unicode/lowercase normalization followed by stripping, and a split on ``delimiter`` (or on
    whitespace when no delimiter is given).
    """

    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        """
        Args:
            vocab_file: Vocabulary file handed to ``WordLevel``.
            delimiter: Character to split on; a falsy value selects whitespace splitting.
            lowercase: Whether to lowercase the text during normalization.
            unk_token: Token used by the model for out-of-vocabulary words.
            eos_token: End-of-sequence token, used by the post-processor when ``add_double_eos`` is set.
            add_eos: Only recorded in the parameters passed to the base class.
            add_double_eos: Whether to install a post-processor using ``eos_token`` on both sides.
            normalization: Optional name of a unicode normalization to apply first.

        Raises:
            ValueError: If the vocabulary file cannot be loaded.
        """
        try:
            tokenizer = WordLevel(vocab_file, unk_token=unk_token)
            tokenizer = Tokenizer(tokenizer)
        except Exception as err:
            # Chain the original error so the actual parsing failure stays visible in the traceback.
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizer, "
                "please note they are not compatible.".format(vocab_file)
            ) from err

        # Create the correct normalization path
        normalizer = []

        # Include unicode normalization
        if normalization:
            normalizer += [unicode_normalizer_from_str(normalization)]

        # Include case normalization
        if lowercase:
            normalizer += [Lowercase()]

        # Strip normalizer at the end
        normalizer += [Strip(left=True, right=True)]

        # ``Strip`` is always present, so the list is never empty; only wrap it when there are several steps.
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            eos_pair = (eos_token, tokenizer.token_to_id(eos_token))
            tokenizer.post_processor = BertProcessing(eos_pair, eos_pair)

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
class TransfoXLTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Transformer-XL tokenizer (backed by HuggingFace's `tokenizers` library) adapted from Vocab class
    in `the original code <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a
    word-level tokenizer (no sub-word tokenization).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    .. note::

        :obj:`special`, :obj:`min_freq`, :obj:`max_size` and :obj:`never_split` are accepted by the constructor but
        are not forwarded to the backend tokenizer built here.

    Args:
        special (:obj:`List[str]`, `optional`):
            A list of special tokens (to be treated by the original implementation of this tokenizer).
        min_freq (:obj:`int`, `optional`, defaults to 0):
            The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
            will be mapped to :obj:`unk_token`).
        max_size (:obj:`int`, `optional`):
            The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
            after excluding the tokens according to the :obj:`min_freq` rule.
        lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to lowercase the input when tokenizing.
        delimiter (:obj:`str`, `optional`):
            The delimiter used between tokens. When unset, the input is split on whitespace.
        vocab_file (:obj:`str`, `optional`):
            File containing the vocabulary (from the original implementation). Takes precedence over
            :obj:`pretrained_vocab_file` when both are given.
        pretrained_vocab_file (:obj:`str`, `optional`):
            File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
        never_split (`optional`):
            Accepted but not used by this class.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"<eos>"`):
            The end of sequence token.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<formula>"]`):
            A list of additional special tokens (for the HuggingFace functionality).
        add_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to add the end-of-sentence token.
        add_double_eos (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to install a post-processor that uses the end-of-sentence token as both of its special
            tokens.
        normalization (:obj:`str`, `optional`):
            Name of the unicode normalization to apply before tokenizing; no unicode normalization when unset.
    """

    vocab_files_names = VOCAB_FILES_NAMES_FAST
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP_FAST
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = []

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        pretrained_vocab_file=None,
        never_split=None,
        unk_token="<unk>",
        eos_token="<eos>",
        additional_special_tokens=["<formula>"],
        add_eos=False,
        add_double_eos=False,
        normalization=None,
        **kwargs
    ):
        # The default list object is shared between calls: pass a copy so that nothing downstream can
        # mutate the default (or a list owned by the caller).
        if isinstance(additional_special_tokens, list):
            additional_special_tokens = list(additional_special_tokens)

        super().__init__(
            _TransfoXLDelimiterLookupTokenizer(
                vocab_file=vocab_file or pretrained_vocab_file,
                delimiter=delimiter,
                lowercase=lower_case,
                unk_token=unk_token,
                eos_token=eos_token,
                add_eos=add_eos,
                add_double_eos=add_double_eos,
                normalization=normalization,
            ),
            unk_token=unk_token,
            eos_token=eos_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        warnings.warn(
            "The class `TransfoXLTokenizerFast` is deprecated and will be removed in a future version. Please use `TransfoXLTokenizer` with it's enhanced tokenization instead.",
            FutureWarning,
        )

    def save_pretrained(self, save_directory):
        """
        Save the tokenizer through the parent implementation, after warning that the saved vocabulary cannot be
        loaded by the Python-based ``TransfoXLTokenizer``.

        Args:
            save_directory: Directory forwarded to the parent ``save_pretrained``.

        Returns:
            Whatever the parent ``save_pretrained`` returns.
        """
        logger.warning(
            "Please note you will not be able to load the vocabulary in"
            " Python-based TransfoXLTokenizer as they don't share the same structure."
        )

        return super().save_pretrained(save_directory)
class LMOrderedIterator(object): class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
""" """

View File

@@ -15,7 +15,6 @@
""" Tokenization classes for python tokenizers. """ Tokenization classes for python tokenizers.
For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
""" """
import itertools import itertools
import re import re
import unicodedata import unicodedata
@@ -45,6 +44,11 @@ from .utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
# Slow tokenizers are saved in a vocabulary plus three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
def _is_whitespace(char): def _is_whitespace(char):
"""Checks whether `char` is a whitespace character.""" """Checks whether `char` is a whitespace character."""
@@ -190,7 +194,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
tokens_to_add = [] tokens_to_add = []
for token in new_tokens: for token in new_tokens:
assert isinstance(token, str) assert isinstance(token, str)
if not special_tokens and self.init_kwargs.get("do_lower_case", False): if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
token = token.lower() token = token.lower()
if ( if (
token != self.unk_token token != self.unk_token
@@ -239,6 +243,9 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
""" """
Converts a string in a sequence of tokens, using the tokenizer. Converts a string in a sequence of tokens, using the tokenizer.
Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method
won't replace the unknown tokens with the `unk_token` yet (this is done in the `encode()` method)
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
Takes care of added tokens. Takes care of added tokens.
@@ -268,7 +275,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
logger.warning(f"Keyword arguments {kwargs} not recognized.") logger.warning(f"Keyword arguments {kwargs} not recognized.")
# TODO: should this be in the base class? # TODO: should this be in the base class?
if self.init_kwargs.get("do_lower_case", False): if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase # convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens] escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
@@ -740,7 +747,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
return " ".join(tokens) return " ".join(tokens)
def decode( def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
spaces_between_special_tokens: bool = True,
) -> str: ) -> str:
""" """
Converts a sequence of ids in a string, using the tokenizer and vocabulary Converts a sequence of ids in a string, using the tokenizer and vocabulary
@@ -755,6 +766,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
Whether or not to remove special tokens in the decoding. Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean up the tokenization spaces. Whether or not to clean up the tokenization spaces.
spaces_between_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to add spaces around special tokens.
Fast tokenizers behave as if this were set to :obj:`False`.
It defaults to :obj:`True` in slow tokenizers for backward compatibility.
Returns: Returns:
:obj:`str`: The decoded sentence. :obj:`str`: The decoded sentence.
@@ -778,7 +793,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
current_sub_text.append(token) current_sub_text.append(token)
if current_sub_text: if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text)) sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = " ".join(sub_texts)
if spaces_between_special_tokens:
text = " ".join(sub_texts)
else:
text = "".join(sub_texts)
if clean_up_tokenization_spaces: if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text) clean_text = self.clean_up_tokenization(text)

View File

@@ -646,6 +646,8 @@ class SpecialTokensMixin:
# which are not yet in the vocabulary. Necesssary for serialization/de-serialization # which are not yet in the vocabulary. Necesssary for serialization/de-serialization
# TODO clean this up at some point (probably by sitching to fast tokenizers) # TODO clean this up at some point (probably by sitching to fast tokenizers)
for key, value in kwargs.items(): for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES: if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens": if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
@@ -778,6 +780,9 @@ class SpecialTokensMixin:
return self._add_tokens(new_tokens, special_tokens=special_tokens) return self._add_tokens(new_tokens, special_tokens=special_tokens)
def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Placeholder hook for adding tokens; this base version always raises.

    Args:
        new_tokens: Tokens to add, as strings or :obj:`AddedToken` instances.
        special_tokens: Whether the tokens are to be registered as special tokens.

    Raises:
        NotImplementedError: Always. Presumably concrete tokenizer classes override this method.
    """
    raise NotImplementedError
@property @property
def bos_token(self) -> str: def bos_token(self) -> str:
""" """
@@ -1293,11 +1298,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_model_input_sizes: Dict[str, Optional[int]] = {} max_model_input_sizes: Dict[str, Optional[int]] = {}
model_input_names: List[str] = ["token_type_ids", "attention_mask"] model_input_names: List[str] = ["token_type_ids", "attention_mask"]
padding_side: str = "right" padding_side: str = "right"
slow_tokenizer_class = None
def __init__(self, **kwargs): def __init__(self, **kwargs):
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = () self.init_inputs = ()
self.init_kwargs = kwargs self.init_kwargs = copy.deepcopy(kwargs)
# For backward compatibility we fallback to set model_max_length from max_len if provided # For backward compatibility we fallback to set model_max_length from max_len if provided
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1311,6 +1317,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
self.deprecation_warnings = (
{}
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
super().__init__(**kwargs) super().__init__(**kwargs)
@property @property
@@ -1343,9 +1353,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_single_sentence(self, value) -> int: def max_len_single_sentence(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_single_sentence'. # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
logger.warning( if not self.deprecation_warnings.get("max_len_single_sentence", False):
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." logger.warning(
) "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_single_sentence"] = True
else: else:
raise ValueError( raise ValueError(
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
@@ -1355,16 +1367,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def max_len_sentences_pair(self, value) -> int: def max_len_sentences_pair(self, value) -> int:
# For backward compatibility, allow to try to setup 'max_len_sentences_pair'. # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
logger.warning( if not self.deprecation_warnings.get("max_len_sentences_pair", False):
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." logger.warning(
) "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
)
self.deprecation_warnings["max_len_sentences_pair"] = True
else: else:
raise ValueError( raise ValueError(
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
) )
@classmethod @classmethod
def from_pretrained(cls, *inputs, **kwargs): def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
r""" r"""
Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
a predefined tokenizer. a predefined tokenizer.
@@ -1425,10 +1439,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
assert tokenizer.unk_token == '<unk>' assert tokenizer.unk_token == '<unk>'
""" """
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop("cache_dir", None) cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False) force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False) resume_download = kwargs.pop("resume_download", False)
@@ -1475,7 +1485,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
"added_tokens_file": ADDED_TOKENS_FILE, "added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
"full_tokenizer_file": FULL_TOKENIZER_FILE, "tokenizer_file": FULL_TOKENIZER_FILE,
} }
# Look for the tokenizer files # Look for the tokenizer files
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
@@ -1541,6 +1551,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
else: else:
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
return cls._from_pretrained(
resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
)
@classmethod
def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we can also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from its serialization JSON
if cls.slow_tokenizer_class is not None:
slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
copy.deepcopy(init_configuration),
*init_inputs,
**(copy.deepcopy(kwargs)),
)
else:
slow_tokenizer = None
# Prepare tokenizer initialization kwargs # Prepare tokenizer initialization kwargs
# Did we saved some inputs and kwargs to reload ? # Did we saved some inputs and kwargs to reload ?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
@@ -1556,6 +1588,19 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Update with newly provided kwargs # Update with newly provided kwargs
init_kwargs.update(kwargs) init_kwargs.update(kwargs)
# Convert AddedTokens serialized as dict to class instances
def convert_added_tokens(obj: Any):
    """
    Recursively rebuild :obj:`AddedToken` instances from their serialized dict form.

    A dict whose ``"__type"`` entry equals ``"AddedToken"`` is turned into ``AddedToken(**fields)``, where
    ``fields`` is every other entry of that dict. Lists and tuples are converted element-wise (the result is
    always a list), other dicts value-wise, and anything else is returned unchanged.

    The input is never modified: the ``"__type"`` marker is filtered out of a copy instead of being popped
    from the caller's dict.
    """
    if isinstance(obj, dict):
        if obj.get("__type") == "AddedToken":
            fields = {key: value for key, value in obj.items() if key != "__type"}
            return AddedToken(**fields)
        return {key: convert_added_tokens(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_added_tokens(item) for item in obj]
    return obj
init_kwargs = convert_added_tokens(init_kwargs)
# Set max length if needed # Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes: if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer # if we're using a pretrained model, ensure the tokenizer
@@ -1570,6 +1615,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if args_name not in init_kwargs: if args_name not in init_kwargs:
init_kwargs[args_name] = file_path init_kwargs[args_name] = file_path
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
# Instantiate tokenizer. # Instantiate tokenizer.
try: try:
tokenizer = cls(*init_inputs, **init_kwargs) tokenizer = cls(*init_inputs, **init_kwargs)
@@ -1580,8 +1628,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
) )
# Save inputs and kwargs for saving and re-loading with ``save_pretrained`` # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs # Removed: Now done at the base class level
tokenizer.init_kwargs = init_kwargs # tokenizer.init_inputs = init_inputs
# tokenizer.init_kwargs = init_kwargs
# If there is a complementary special token map, load it # If there is a complementary special token map, load it
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
@@ -1589,11 +1638,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle) special_tokens_map = json.load(special_tokens_map_handle)
special_tokens_map = convert_added_tokens(special_tokens_map)
for key, value in special_tokens_map.items(): for key, value in special_tokens_map.items():
if isinstance(value, dict):
value = AddedToken(**value)
elif isinstance(value, list):
value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
setattr(tokenizer, key, value) setattr(tokenizer, key, value)
# Add supplementary tokens. # Add supplementary tokens.
@@ -1623,14 +1669,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def save_pretrained(self, save_directory: str) -> Tuple[str]: def save_pretrained(self, save_directory: str) -> Tuple[str]:
""" """
Save the tokenizer vocabulary files together with: Save the full tokenizer state.
- added tokens,
- special tokens to class attributes mapping,
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).
This method make sure the full tokenizer can then be re-loaded using the This method make sure the full tokenizer can then be re-loaded using the
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` class method. :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
.. Note::
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with
this method cannot be loaded back
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance.
.. Warning:: .. Warning::
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance, This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
@@ -1648,7 +1697,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
os.makedirs(save_directory, exist_ok=True) os.makedirs(save_directory, exist_ok=True)
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs) tokenizer_config = copy.deepcopy(self.init_kwargs)
@@ -1657,22 +1705,33 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
for file_id in self.vocab_files_names.keys(): for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None) tokenizer_config.pop(file_id, None)
# Sanitize AddedTokens
def convert_added_tokens(obj: Union[AddedToken, Any]):
    """
    Recursively turn :obj:`AddedToken` instances into JSON-serializable dicts.

    An :obj:`AddedToken` becomes the dict returned by its ``__getstate__()`` with an extra
    ``"__type": "AddedToken"`` entry. Lists and tuples are converted element-wise (the result is always a
    list), dicts value-wise, and any other object is returned as is.
    """
    if isinstance(obj, AddedToken):
        state = obj.__getstate__()
        # Tag the state so the token can be recognised when the file is read back
        state["__type"] = "AddedToken"
        return state
    if isinstance(obj, (list, tuple)):
        return [convert_added_tokens(item) for item in obj]
    if isinstance(obj, dict):
        return {key: convert_added_tokens(value) for key, value in obj.items()}
    return obj
tokenizer_config = convert_added_tokens(tokenizer_config)
with open(tokenizer_config_file, "w", encoding="utf-8") as f: with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False)) f.write(json.dumps(tokenizer_config, ensure_ascii=False))
# Sanitize AddedTokens in special_tokens_map
write_dict = convert_added_tokens(self.special_tokens_map_extended)
with open(special_tokens_map_file, "w", encoding="utf-8") as f: with open(special_tokens_map_file, "w", encoding="utf-8") as f:
write_dict = {}
for key, value in self.special_tokens_map_extended.items():
if isinstance(value, AddedToken):
write_dict[key] = value.__getstate__()
elif isinstance(value, list):
write_dict[key] = [
token.__getstate__() if isinstance(token, AddedToken) else token for token in value
]
else:
write_dict[key] = value
f.write(json.dumps(write_dict, ensure_ascii=False)) f.write(json.dumps(write_dict, ensure_ascii=False))
file_names = (tokenizer_config_file, special_tokens_map_file)
return self._save_pretrained(save_directory, file_names)
def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
added_vocab = self.get_added_vocab() added_vocab = self.get_added_vocab()
if added_vocab: if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f: with open(added_tokens_file, "w", encoding="utf-8") as f:
@@ -1681,7 +1740,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
vocab_files = self.save_vocabulary(save_directory) vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file) return file_names + (vocab_files, added_tokens_file)
@add_end_docstrings( @add_end_docstrings(
ENCODE_KWARGS_DOCSTRING, ENCODE_KWARGS_DOCSTRING,
@@ -1752,13 +1811,15 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# If you only set max_length, it activates truncation for max_length # If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is False: if max_length is not None and padding is False and truncation is False:
if verbose: if verbose:
logger.warning( if not self.deprecation_warnings.get("Truncation-not-explicitely-activated", False):
"Truncation was not explicitely activated but `max_length` is provided a specific value, " logger.warning(
"please use `truncation=True` to explicitely truncate examples to max length. " "Truncation was not explicitely activated but `max_length` is provided a specific value, "
"Defaulting to 'longest_first' truncation strategy. " "please use `truncation=True` to explicitely truncate examples to max length. "
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " "Defaulting to 'longest_first' truncation strategy. "
"more precisely by providing a specific strategy to `truncation`." "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
) "more precisely by providing a specific strategy to `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitely-activated"] = True
truncation = "longest_first" truncation = "longest_first"
# Get padding strategy # Get padding strategy
@@ -1818,10 +1879,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if padding_strategy == PaddingStrategy.MAX_LENGTH: if padding_strategy == PaddingStrategy.MAX_LENGTH:
if self.model_max_length > LARGE_INTEGER: if self.model_max_length > LARGE_INTEGER:
if verbose: if verbose:
logger.warning( if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " logger.warning(
"Default to no padding." "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
) "Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD padding_strategy = PaddingStrategy.DO_NOT_PAD
else: else:
max_length = self.model_max_length max_length = self.model_max_length
@@ -1829,10 +1892,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
if self.model_max_length > LARGE_INTEGER: if self.model_max_length > LARGE_INTEGER:
if verbose: if verbose:
logger.warning( if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " logger.warning(
"Default to no truncation." "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
) "Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
else: else:
max_length = self.model_max_length max_length = self.model_max_length
@@ -2437,6 +2502,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
len_ids = len(ids) len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0 len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids is not None and not add_special_tokens:
raise ValueError(
"Asking to return token_type_ids while setting add_special_tokens to False "
"results in an undefined behavior. Please set add_special_tokens to True or "
"set return_token_type_ids to None."
)
# Load from model defaults # Load from model defaults
if return_token_type_ids is None: if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names return_token_type_ids = "token_type_ids" in self.model_input_names
@@ -2469,7 +2541,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
else: else:
sequence = ids + pair_ids if pair else ids sequence = ids + pair_ids if pair else ids
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
# Build output dictionnary # Build output dictionnary
encoded_inputs["input_ids"] = sequence encoded_inputs["input_ids"] = sequence
@@ -2483,11 +2555,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
# Check lengths # Check lengths
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose: if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
logger.warning( if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
"Token indices sequence length is longer than the specified maximum sequence length " logger.warning(
"for this model ({} > {}). Running this sequence through the model will result in " "Token indices sequence length is longer than the specified maximum sequence length "
"indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length) "for this model ({} > {}). Running this sequence through the model will result in "
) "indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
)
self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
# Padding # Padding
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
@@ -2703,7 +2777,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
] ]
def decode( def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str: ) -> str:
""" """
Converts a sequence of ids in a string, using the tokenizer and vocabulary Converts a sequence of ids in a string, using the tokenizer and vocabulary

View File

@@ -16,16 +16,19 @@
For slow (python) tokenizers see tokenization_utils.py For slow (python) tokenizers see tokenization_utils.py
""" """
import copy
import os import os
import warnings import warnings
from collections import defaultdict from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
from tokenizers import Encoding as EncodingFast from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast
from .convert_slow_tokenizer import convert_slow_tokenizer
from .file_utils import add_end_docstrings from .file_utils import add_end_docstrings
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import ( from .tokenization_utils_base import (
INIT_TOKENIZER_DOCSTRING, INIT_TOKENIZER_DOCSTRING,
AddedToken, AddedToken,
@@ -44,6 +47,15 @@ from .utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
# Fast tokenizers (provided by HuggingFace's tokenizers library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
# Slow tokenizers have an additional added tokens file
ADDED_TOKENS_FILE = "added_tokens.json"
@add_end_docstrings( @add_end_docstrings(
INIT_TOKENIZER_DOCSTRING, INIT_TOKENIZER_DOCSTRING,
""" """
@@ -64,12 +76,19 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
dictionary structures (BPE, sentencepiece...). dictionary structures (BPE, sentencepiece...).
""" """
def __init__(self, tokenizer: BaseTokenizerFast, **kwargs): slow_tokenizer_class: PreTrainedTokenizer = None
if not isinstance(tokenizer, BaseTokenizerFast):
raise ValueError( def __init__(self, *args, **kwargs):
"Tokenizer should be an instance of a BaseTokenizer " "provided by HuggingFace tokenizers library." # We instantiate fast tokenizers based on a slow tokenizer for now
) # In the future we'll also use a direct way based on saving/instantiating
self._tokenizer: BaseTokenizerFast = tokenizer # tokenizer's Tokenizer directly from it's serialization JSON
if "__slow_tokenizer" in kwargs and kwargs["__slow_tokenizer"]:
slow_tokenizer = kwargs.pop("__slow_tokenizer")
else:
slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
self._tokenizer = convert_slow_tokenizer(slow_tokenizer)
kwargs = copy.deepcopy(slow_tokenizer.init_kwargs)
# We call this after having initialized the backend tokenizer because we update it. # We call this after having initialized the backend tokenizer because we update it.
super().__init__(**kwargs) super().__init__(**kwargs)
@@ -116,7 +135,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return self._tokenizer.get_vocab_size(with_added_tokens=True) return self._tokenizer.get_vocab_size(with_added_tokens=True)
@property @property
def backend_tokenizer(self) -> BaseTokenizerFast: def backend_tokenizer(self) -> TokenizerFast:
""" """
:obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend. :obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
""" """
@@ -259,6 +278,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
""" """
Converts a string in a sequence of tokens, using the backend Rust tokenizer. Converts a string in a sequence of tokens, using the backend Rust tokenizer.
Note that, unlike slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method
will replace the unknown tokens with the :obj:`unk_token`.
Args: Args:
text (:obj:`str`): text (:obj:`str`):
The sequence to be encoded. The sequence to be encoded.
@@ -343,7 +365,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
) -> BatchEncoding: ) -> BatchEncoding:
if not isinstance(batch_text_or_text_pairs, list): if not isinstance(batch_text_or_text_pairs, list):
raise ValueError( raise TypeError(
"batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs)) "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
) )
@@ -487,7 +509,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return batched_output return batched_output
def decode( def decode(
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
**kwargs
) -> str: ) -> str:
""" """
Converts a sequence of ids in a string, using the tokenizer and vocabulary Converts a sequence of ids in a string, using the tokenizer and vocabulary
@@ -496,7 +522,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
Args: Args:
token_ids (:obj:`List[int]`): token_ids (:obj:`Union[int, List[int]]`):
List of tokenized input ids. Can be obtained using the ``__call__`` method. List of tokenized input ids. Can be obtained using the ``__call__`` method.
skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to remove special tokens in the decoding. Whether or not to remove special tokens in the decoding.
@@ -506,6 +532,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Returns: Returns:
:obj:`str`: The decoded sentence. :obj:`str`: The decoded sentence.
""" """
if isinstance(token_ids, int):
token_ids = [token_ids]
text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
if clean_up_tokenization_spaces: if clean_up_tokenization_spaces:
@@ -520,8 +548,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
and special token mappings. and special token mappings.
.. warning:: .. warning::
Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if Please use :meth:`~transformers.PreTrainedTokenizerFast.save_pretrained` to save the full tokenizer state if
you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method. you want to reload it using the :meth:`~transformers.PreTrainedTokenizerFast.from_pretrained` class method.
Args: Args:
save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved. save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved.
@@ -530,7 +558,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
A tuple of :obj:`str`: The files saved. A tuple of :obj:`str`: The files saved.
""" """
if os.path.isdir(save_directory): if os.path.isdir(save_directory):
files = self._tokenizer.save_model(save_directory) files = self._tokenizer.model.save(save_directory)
else: else:
folder, file = os.path.split(os.path.abspath(save_directory)) folder, file = os.path.split(os.path.abspath(save_directory))
files = self._tokenizer.save_model(folder, name=file) files = self._tokenizer.save_model(folder, name=file)

View File

@@ -648,6 +648,10 @@ class XLMTokenizer(PreTrainedTokenizer):
self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {} self.cache = {}
@property
def do_lower_case(self):
    """Expose :obj:`do_lowercase_and_remove_accent` under the ``do_lower_case`` name."""
    return self.do_lowercase_and_remove_accent
def moses_punct_norm(self, text, lang): def moses_punct_norm(self, text, lang):
if lang not in self.cache_moses_punct_normalizer: if lang not in self.cache_moses_punct_normalizer:
punct_normalizer = sm.MosesPunctNormalizer(lang=lang) punct_normalizer = sm.MosesPunctNormalizer(lang=lang)

View File

@@ -20,6 +20,7 @@ from shutil import copyfile
from typing import List, Optional from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlnet import SPIECE_UNDERLINE from .tokenization_xlnet import SPIECE_UNDERLINE
from .utils import logging from .utils import logging
@@ -307,3 +308,190 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,) return (out_vocab_file,)
class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
    """
    "Fast" XLM-RoBERTa tokenizer, backed by HuggingFace's `tokenizers` library. Adapted from
    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`, and based on `SentencePiece
    <https://github.com/google/sentencepiece>`__.

    Most of the main methods are provided by the :class:`~transformers.PreTrainedTokenizerFast` superclass; refer to
    it for more information about them.

    Args:
        vocab_file (:obj:`str`):
            Path to the vocabulary file.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning-of-sequence token used during pretraining. Can be used as a sequence classifier token.

            .. note::

                This is not the token placed at the beginning of a sequence built with special tokens; that token is
                the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end-of-sequence token.

            .. note::

                This is not the token placed at the end of a sequence built with special tokens; that token is the
                :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, used when one sequence is built from several (e.g. two sequences for sequence
            classification, or a text and a question for question answering). It is also the last token of a
            sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token, used for sequence classification (classifying the whole sequence rather than each
            token). It is the first token of a sequence built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token missing from the vocabulary cannot be converted to an ID and is replaced by
            this token.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The padding token, used for example when batching sequences of different lengths.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
            The masking token, used when training with masked language modeling; it is the token the model tries to
            predict.
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer. Not a named parameter of this class: it is accepted
            through ``**kwargs``, so its default is not set here.
    """

    # NOTE(review): the original documentation listed an ``sp_model`` attribute, but nothing in this class
    # assigns one - confirm whether the fast tokenizer exposes it before documenting it again.

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]
    slow_tokenizer_class = XLMRobertaTokenizer

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        **kwargs
    ):
        """Forward every argument to the superclass constructor, then remember ``vocab_file``."""
        super().__init__(
            vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Kept so that `save_vocabulary` can copy the original file.
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Concatenate one or two sequences and add the special tokens around them.

        The resulting XLM-RoBERTa layout is:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        cls_ids = [self.cls_token_id]
        sep_ids = [self.sep_token_id]
        first_segment = cls_ids + token_ids_0 + sep_ids
        if token_ids_1 is None:
            return first_segment
        return first_segment + sep_ids + token_ids_1 + sep_ids

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build the mask flagging special-token positions for the sequence(s) given.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            :obj:`ValueError`: If :obj:`already_has_special_tokens` is set and :obj:`token_ids_1` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only the separator and classifier ids are flagged as special here.
            return [1 if token_id in [self.sep_token_id, self.cls_token_id] else 0 for token_id in token_ids_0]

        # Layout mirrors `build_inputs_with_special_tokens`: <s> A </s> [</s> B </s>]
        first_mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is None:
            return first_mask
        return first_mask + [1] + [0] * len(token_ids_1) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Return the token type IDs for the sequence(s) given. XLM-RoBERTa does not make use of token type ids, so the
        result is a list of zeros as long as the sequence built with special tokens.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep_ids = [self.sep_token_id]
        cls_ids = [self.cls_token_id]

        with_specials = cls_ids + token_ids_0 + sep_ids
        if token_ids_1 is not None:
            with_specials = with_specials + sep_ids + token_ids_1 + sep_ids
        return [0] * len(with_specials)

    def save_vocabulary(self, save_directory):
        """
        Copy the original sentencepiece vocabulary file into a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved. :obj:`None` is returned (after logging an error) when
            :obj:`save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return None

        target_path = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
        # Skip the copy when the vocabulary already lives at the target location.
        already_in_place = os.path.abspath(self.vocab_file) == os.path.abspath(target_path)
        if not already_in_place:
            copyfile(self.vocab_file, target_path)

        return (target_path,)

View File

@@ -21,6 +21,7 @@ from shutil import copyfile
from typing import List, Optional from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging from .utils import logging
@@ -344,3 +345,213 @@ class XLNetTokenizer(PreTrainedTokenizer):
copyfile(self.vocab_file, out_vocab_file) copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,) return (out_vocab_file,)
class XLNetTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on
    `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to lowercase the input when tokenizing.
        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to keep accents when tokenizing.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering.
            It is also used as the last token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the last token of the sequence when built with
            special tokens (see :meth:`build_inputs_with_special_tokens`).
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer. :obj:`None` selects the default list.
    """

    # NOTE(review): the original documentation listed an ``sp_model`` attribute, but nothing in this class
    # assigns one - confirm whether the fast tokenizer exposes it before documenting it again.

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    padding_side = "left"
    slow_tokenizer_class = XLNetTokenizer

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        additional_special_tokens=None,
        **kwargs
    ):
        """Forward every argument to the superclass constructor, then record the XLNet-specific settings."""
        # A list literal in the signature would be shared by every instance; build a fresh one per call.
        if additional_special_tokens is None:
            additional_special_tokens = ["<eop>", "<eod>"]

        super().__init__(
            vocab_file=vocab_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        # Token type id assigned to padding positions.
        self._pad_token_type_id = 3
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        # Kept so that `save_vocabulary` can copy the original file.
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        An XLNet sequence has the following format:

        - single sequence: ``X <sep> <cls>``
        - pair of sequences: ``A <sep> B <sep> <cls>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep + cls
        return token_ids_0 + sep + token_ids_1 + sep + cls

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build the mask flagging special-token positions for the sequence(s) given.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            :obj:`ValueError`: If :obj:`already_has_special_tokens` is set and :obj:`token_ids_1` is also given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only the separator and classifier ids are flagged as special here.
            return [1 if token_id in [self.sep_token_id, self.cls_token_id] else 0 for token_id in token_ids_0]

        # Layout mirrors `build_inputs_with_special_tokens`: A <sep> [B <sep>] <cls>
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
        return ([0] * len(token_ids_0)) + [1, 1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
        An XLNet sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2
            | first sequence    | second sequence | <cls>

        Each sequence's segment covers its trailing ``<sep>``; the final ``<cls>`` token gets segment id 2.
        If :obj:`token_ids_1` is :obj:`None`, the result is the first portion of the mask (0s) followed by the
        ``<cls>`` segment id.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls_segment_id = [2]

        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0] + cls_segment_id
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id

    def save_vocabulary(self, save_directory):
        """
        Copy the original sentencepiece vocabulary file into a directory.

        Args:
            save_directory (:obj:`str`):
                The directory in which to save the vocabulary.

        Returns:
            :obj:`Tuple(str)`: Paths to the files saved. :obj:`None` is returned (after logging an error) when
            :obj:`save_directory` is not an existing directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        # Skip the copy when the vocabulary already lives at the target location.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

File diff suppressed because it is too large Load Diff

View File

@@ -17,7 +17,7 @@
import os import os
import unittest import unittest
from transformers.tokenization_albert import AlbertTokenizer from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer tokenizer_class = AlbertTokenizer
rust_tokenizer_class = AlbertTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -41,6 +43,28 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
output_text = "this is a test" output_text = "this is a test"
return input_text, output_text return input_text, output_text
def test_rust_and_python_full_tokenizers(self):
    """Check that the slow and fast tokenizers agree on tokens and ids for a sample sentence."""
    if not self.test_rust_tokenizer:
        return

    slow = self.get_tokenizer()
    fast = self.get_rust_tokenizer()
    text = "I was born in 92000, and this is falsé."

    # Token strings must match exactly.
    self.assertListEqual(slow.tokenize(text), fast.tokenize(text))

    # Ids without special tokens.
    self.assertListEqual(
        slow.encode(text, add_special_tokens=False),
        fast.encode(text, add_special_tokens=False),
    )

    # Ids with the default handling of special tokens, using a freshly loaded fast tokenizer.
    fast = self.get_rust_tokenizer()
    self.assertListEqual(slow.encode(text), fast.encode(text))
def test_full_tokenizer(self): def test_full_tokenizer(self):
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

View File

@@ -12,6 +12,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BartTokenizer tokenizer_class = BartTokenizer
rust_tokenizer_class = BartTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -35,7 +35,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertTokenizer tokenizer_class = BertTokenizer
rust_tokenizer_class = BertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -61,9 +63,6 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_rust_tokenizer(self, **kwargs):
return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running" input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running" output_text = "unwanted, running"

View File

@@ -15,6 +15,7 @@
import os import os
import pickle
import unittest import unittest
from transformers.testing_utils import custom_tokenizers from transformers.testing_utils import custom_tokenizers
@@ -33,6 +34,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertJapaneseTokenizer tokenizer_class = BertJapaneseTokenizer
space_between_special_tokens = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -87,6 +89,26 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""]) self.assertListEqual(tokens, ["こんにちは", "", "世界", "", "こん", "##ばんは", "", "世界", ""])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
def test_pickle_mecab_tokenizer(self):
    """Check that a tokenizer using the "mecab" word tokenizer tokenizes identically after a pickle round trip."""
    tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
    self.assertIsNotNone(tokenizer)

    text = "こんにちは、世界。\nこんばんは、世界。"
    tokens = tokenizer.tokenize(text)
    # The punctuation marks of `text` are tokens of their own; they map to ids 12 and 14 below.
    self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

    # Round-trip through a real file; the pickle is one this test just wrote, so loading it is safe.
    filename = os.path.join(self.tmpdirname, "tokenizer.bin")
    with open(filename, "wb") as handle:
        pickle.dump(tokenizer, handle)

    with open(filename, "rb") as handle:
        tokenizer_new = pickle.load(handle)

    tokens_loaded = tokenizer_new.tokenize(text)

    self.assertListEqual(tokens, tokens_loaded)
def test_mecab_tokenizer_ipadic(self): def test_mecab_tokenizer_ipadic(self):
tokenizer = MecabTokenizer(mecab_dic="ipadic") tokenizer = MecabTokenizer(mecab_dic="ipadic")

View File

@@ -0,0 +1,64 @@
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from transformers.testing_utils import _torch_available
from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin
# SentencePiece model fixture, resolved relative to this test file.
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
# "pt" when `_torch_available` is truthy, otherwise "tf"; not referenced elsewhere in this module.
FRAMEWORK = "pt" if _torch_available else "tf"
class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    """Tokenizer tests for CamemBERT, covering both the slow and the fast implementation."""

    tokenizer_class = CamembertTokenizer
    rust_tokenizer_class = CamembertTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        """Save a tokenizer built from the SentencePiece fixture into ``self.tmpdirname``."""
        super().setUp()

        # We have a SentencePiece fixture for testing
        CamembertTokenizer(SAMPLE_VOCAB).save_pretrained(self.tmpdirname)

    def test_rust_and_python_full_tokenizers(self):
        """Check that the slow and fast tokenizers agree on tokens and ids for a sample sentence."""
        if not self.test_rust_tokenizer:
            return

        slow = self.get_tokenizer()
        fast = self.get_rust_tokenizer()
        text = "I was born in 92000, and this is falsé."

        # Token strings must match exactly.
        self.assertListEqual(slow.tokenize(text), fast.tokenize(text))

        # Ids without special tokens.
        self.assertListEqual(
            slow.encode(text, add_special_tokens=False),
            fast.encode(text, add_special_tokens=False),
        )

        # Ids with the default handling of special tokens, using a freshly loaded fast tokenizer.
        fast = self.get_rust_tokenizer()
        self.assertListEqual(slow.encode(text), fast.encode(text))

View File

@@ -56,7 +56,9 @@ def merge_model_tokenizer_mappings(
class TokenizerTesterMixin: class TokenizerTesterMixin:
tokenizer_class = None tokenizer_class = None
rust_tokenizer_class = None
test_rust_tokenizer = False test_rust_tokenizer = False
space_between_special_tokens = False
def setUp(self): def setUp(self):
self.tmpdirname = tempfile.mkdtemp() self.tmpdirname = tempfile.mkdtemp()
@@ -68,12 +70,15 @@ class TokenizerTesterMixin:
input_txt = self.get_clean_sequence(tokenizer)[0] input_txt = self.get_clean_sequence(tokenizer)[0]
return input_txt, input_txt return input_txt, input_txt
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]: def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))] toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks)) toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks)) toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
if max_length is not None and len(toks) > max_length: if max_length is not None and len(toks) > max_length:
toks = toks[:max_length] toks = toks[:max_length]
if min_length is not None and len(toks) < min_length and len(toks) > 0:
while len(toks) < min_length:
toks = toks + toks
# toks_str = [t[1] for t in toks] # toks_str = [t[1] for t in toks]
toks_ids = [t[0] for t in toks] toks_ids = [t[0] for t in toks]
@@ -99,7 +104,7 @@ class TokenizerTesterMixin:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
raise NotImplementedError return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
# def get_input_output_texts(self) -> Tuple[str, str]: # def get_input_output_texts(self) -> Tuple[str, str]:
# """Feel free to overwrite""" # """Feel free to overwrite"""
@@ -118,6 +123,29 @@ class TokenizerTesterMixin:
for i in range(len(batch_encode_plus_sequences["input_ids"])) for i in range(len(batch_encode_plus_sequences["input_ids"]))
] ]
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence, _ = self.get_input_output_texts(tokenizer)
# We don't have an exact equivalence on `tokenize()` between Rust and Slow
# Slow tokenizer only split tokens, Rust tokenizers will replace with <unk>
# tokens = tokenizer.tokenize(sequence)
# rust_tokens = rust_tokenizer.tokenize(sequence)
# self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
ids = tokenizer.encode(sequence, add_special_tokens=True)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True)
self.assertListEqual(ids, rust_ids)
def test_tokenizers_common_properties(self): def test_tokenizers_common_properties(self):
tokenizers = self.get_tokenizers() tokenizers = self.get_tokenizers()
for tokenizer in tokenizers: for tokenizer in tokenizers:
@@ -241,6 +269,9 @@ class TokenizerTesterMixin:
tokenizers = self.get_tokenizers(fast=False, do_lower_case=True) tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
for tokenizer in tokenizers: for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
continue
special_token = tokenizer.all_special_tokens[0] special_token = tokenizer.all_special_tokens[0]
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
@@ -272,6 +303,9 @@ class TokenizerTesterMixin:
tokenizers = self.get_tokenizers(fast=False, do_lower_case=False) tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
for tokenizer in tokenizers: for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
continue
special_token = tokenizer.all_special_tokens[0] special_token = tokenizer.all_special_tokens[0]
text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
@@ -282,7 +316,7 @@ class TokenizerTesterMixin:
toks0 = tokenizer.tokenize(text) # toks before adding new_toks toks0 = tokenizer.tokenize(text) # toks before adding new_toks
added = tokenizer.add_tokens(new_toks) added = tokenizer.add_tokens(new_toks)
self.assertEqual(added, 4) self.assertIn(added, [2, 4])
toks = tokenizer.tokenize(text) toks = tokenizer.tokenize(text)
toks2 = tokenizer.tokenize(text2) toks2 = tokenizer.tokenize(text2)
@@ -390,12 +424,17 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers: for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
new_toks = ["[ABC]", "[DEF]"] # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"] # new_toks = ["[ABC]", "[DEF]"] # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
tokenizer.add_tokens(new_toks) tokenizer.add_tokens(new_toks)
input = "[ABC] [DEF] [ABC] [DEF]" # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]" input = "[ABC][DEF][ABC][DEF]" # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
if self.space_between_special_tokens:
output = "[ABC] [DEF] [ABC] [DEF]"
else:
output = input
encoded = tokenizer.encode(input, add_special_tokens=False) encoded = tokenizer.encode(input, add_special_tokens=False)
decoded = tokenizer.decode(encoded) decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertEqual(decoded, input) self.assertIn(decoded, [output, output.lower()])
def test_pretrained_model_lists(self): def test_pretrained_model_lists(self):
weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
@@ -447,7 +486,7 @@ class TokenizerTesterMixin:
sequence = tokenizer.encode(seq_0, add_special_tokens=False) sequence = tokenizer.encode(seq_0, add_special_tokens=False)
total_length = len(sequence) total_length = len(sequence)
assert total_length > 1, "Issue with the testing sequence, please update it it's too short" assert total_length > 4, "Issue with the testing sequence, please update it it's too short"
# Test with max model input length # Test with max model input length
model_max_length = tokenizer.model_max_length model_max_length = tokenizer.model_max_length
@@ -546,6 +585,7 @@ class TokenizerTesterMixin:
model_max_length = tokenizer.model_max_length model_max_length = tokenizer.model_max_length
self.assertEqual(model_max_length, 100) self.assertEqual(model_max_length, 100)
seq_2 = seq_0 * model_max_length seq_2 = seq_0 * model_max_length
assert len(seq_2) > model_max_length
sequence1 = tokenizer(seq_1, add_special_tokens=False) sequence1 = tokenizer(seq_1, add_special_tokens=False)
total_length1 = len(sequence1["input_ids"]) total_length1 = len(sequence1["input_ids"])
@@ -559,9 +599,9 @@ class TokenizerTesterMixin:
[False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False] [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
) )
for padding_state in padding_strategies: for padding_state in padding_strategies:
with self.subTest(f"Padding: {padding_state}"): with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
for truncation_state in [True, "longest_first", "only_first"]: for truncation_state in [True, "longest_first", "only_first"]:
with self.subTest(f"Truncation: {truncation_state}"): with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state) output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
self.assertEqual(len(output["input_ids"]), model_max_length) self.assertEqual(len(output["input_ids"]), model_max_length)
@@ -748,34 +788,47 @@ class TokenizerTesterMixin:
# # This is not supported with the Rust tokenizers # # This is not supported with the Rust tokenizers
# # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input) # # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)
def test_swap_special_token(self): # def test_swap_special_token(self):
tokenizers = self.get_tokenizers(do_lower_case=False) # tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers: # for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): # with self.subTest(f"{tokenizer.__class__.__name__}"):
mask = "<mask>" # # Our mask token
sequence = "Encode this sequence" # mask = "<mask>"
sequence_masked_0 = "Encode <mask> sequence" # # We take a single word in the middle of the vocabulary
sequence_masked_1 = "<mask> this sequence" # all_tokens = sorted(tokenizer.get_vocab().keys())
# word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1])
# Add tokens so that masked token isn't split # sequence_0 = "Encode " + word + " sequence"
tokenizer.add_tokens(sequence.split()) # sequence_masked_0 = "Encode " + mask + " sequence"
tokenizer.add_special_tokens({"mask_token": mask})
mask_ind = tokenizer.convert_tokens_to_ids(mask)
encoded = tokenizer.encode(sequence, add_special_tokens=False)
# Test first masked sequence # sequence_1 = word + " this sequence"
encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False) # sequence_masked_1 = mask + " this sequence"
mask_loc = encoded_masked.index(mask_ind)
encoded_masked[mask_loc] = encoded[mask_loc]
self.assertEqual(encoded_masked, encoded) # # Add tokens so that masked token isn't split
# # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()]
# # tokenizer.add_tokens(tokens)
# tokenizer.add_special_tokens(
# {"mask_token": AddedToken(mask, normalized=False)}
# ) # Eat left space on Byte-level BPE tokenizers
# mask_ind = tokenizer.convert_tokens_to_ids(mask)
# Test second masked sequence # # Test first masked sequence
encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False) # encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
mask_loc = encoded_masked.index(mask_ind) # encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
encoded_masked[mask_loc] = encoded[mask_loc] # assert len(encoded_masked) == len(encoded_0)
# mask_loc = encoded_masked.index(mask_ind)
# encoded_masked[mask_loc] = encoded_0[mask_loc]
self.assertEqual(encoded_masked, encoded) # self.assertEqual(encoded_masked, encoded_0)
# # Test second masked sequence
# encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
# encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
# assert len(encoded_masked) == len(encoded_1)
# mask_loc = encoded_masked.index(mask_ind)
# encoded_masked[mask_loc] = encoded_1[mask_loc]
# self.assertEqual(encoded_masked, encoded_1)
def test_special_tokens_mask(self): def test_special_tokens_mask(self):
tokenizers = self.get_tokenizers(do_lower_case=False) tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -919,10 +972,10 @@ class TokenizerTesterMixin:
def test_padding_to_multiple_of(self): def test_padding_to_multiple_of(self):
tokenizers = self.get_tokenizers() tokenizers = self.get_tokenizers()
for tokenizer in tokenizers: for tokenizer in tokenizers:
if tokenizer.pad_token is None: with self.subTest(f"{tokenizer.__class__.__name__}"):
self.skipTest("No padding token.") if tokenizer.pad_token is None:
else: self.skipTest("No padding token.")
with self.subTest(f"{tokenizer.__class__.__name__}"): else:
empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8) empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8) normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
for key, value in empty_tokens.items(): for key, value in empty_tokens.items():
@@ -1063,14 +1116,15 @@ class TokenizerTesterMixin:
tokenizers = self.get_tokenizers(do_lower_case=False) tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers: for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
vocab = tokenizer.get_vocab() vocab_dict = tokenizer.get_vocab()
self.assertIsInstance(vocab_dict, dict)
self.assertGreaterEqual(len(tokenizer), len(vocab_dict))
self.assertIsInstance(vocab, dict) vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
self.assertEqual(len(vocab), len(tokenizer)) self.assertEqual(len(vocab), len(tokenizer))
tokenizer.add_tokens(["asdfasdfasdfasdf"]) tokenizer.add_tokens(["asdfasdfasdfasdf"])
vocab = tokenizer.get_vocab() vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
self.assertIsInstance(vocab, dict)
self.assertEqual(len(vocab), len(tokenizer)) self.assertEqual(len(vocab), len(tokenizer))
def test_conversion_reversible(self): def test_conversion_reversible(self):
@@ -1079,6 +1133,8 @@ class TokenizerTesterMixin:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
vocab = tokenizer.get_vocab() vocab = tokenizer.get_vocab()
for word, ind in vocab.items(): for word, ind in vocab.items():
if word == tokenizer.unk_token:
continue
self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind) self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word) self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)
@@ -1173,12 +1229,13 @@ class TokenizerTesterMixin:
def test_added_token_serializable(self): def test_added_token_serializable(self):
tokenizers = self.get_tokenizers(do_lower_case=False) tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers: for tokenizer in tokenizers:
new_token = AddedToken("new_token", lstrip=True) with self.subTest(f"{tokenizer.__class__.__name__}"):
tokenizer.add_special_tokens({"additional_special_tokens": [new_token]}) new_token = AddedToken("new_token", lstrip=True)
tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
with tempfile.TemporaryDirectory() as tmp_dir_name: with tempfile.TemporaryDirectory() as tmp_dir_name:
tokenizer.save_pretrained(tmp_dir_name) tokenizer.save_pretrained(tmp_dir_name)
tokenizer.from_pretrained(tmp_dir_name) tokenizer.from_pretrained(tmp_dir_name)
def test_batch_encode_plus_padding(self): def test_batch_encode_plus_padding(self):
# Test that padded sequences are equivalent between batch_encode_plus and encode_plus # Test that padded sequences are equivalent between batch_encode_plus and encode_plus
@@ -1243,6 +1300,9 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers: for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"): with self.subTest(f"{tokenizer.__class__.__name__}"):
if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space:
continue
# Prepare a sequence from our tokenizer vocabulary # Prepare a sequence from our tokenizer vocabulary
sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20) sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20)
# sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good # sequence = " " + sequence # To be sure the byte-level tokenizers are feeling good
@@ -1345,12 +1405,14 @@ class TokenizerTesterMixin:
def test_prepare_for_model(self): def test_prepare_for_model(self):
tokenizers = self.get_tokenizers(do_lower_case=False) tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers: for tokenizer in tokenizers:
string_sequence = "Testing the prepare_for_model method." with self.subTest(f"{tokenizer.__class__.__name__}"):
ids = tokenizer.encode(string_sequence, add_special_tokens=False) string_sequence = "Testing the prepare_for_model method."
input_dict = tokenizer.encode_plus(string_sequence) ids = tokenizer.encode(string_sequence, add_special_tokens=False)
prepared_input_dict = tokenizer.prepare_for_model(ids) prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True)
self.assertEqual(input_dict, prepared_input_dict) input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True)
self.assertEqual(input_dict, prepared_input_dict)
def test_batch_encode_plus_overflowing_tokens(self): def test_batch_encode_plus_overflowing_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False) tokenizers = self.get_tokenizers(do_lower_case=False)

View File

@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CTRLTokenizer tokenizer_class = CTRLTokenizer
test_rust_tokenizer = False
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -23,9 +23,8 @@ from .test_tokenization_bert import BertTokenizationTest
class DistilBertTokenizationTest(BertTokenizationTest): class DistilBertTokenizationTest(BertTokenizationTest):
tokenizer_class = DistilBertTokenizer tokenizer_class = DistilBertTokenizer
rust_tokenizer_class = DistilBertTokenizerFast
def get_rust_tokenizer(self, **kwargs): test_rust_tokenizer = True
return DistilBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@slow @slow
def test_sequence_builders(self): def test_sequence_builders(self):

View File

@@ -32,25 +32,22 @@ from .test_tokenization_bert import BertTokenizationTest
class DPRContextEncoderTokenizationTest(BertTokenizationTest): class DPRContextEncoderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRContextEncoderTokenizer tokenizer_class = DPRContextEncoderTokenizer
rust_tokenizer_class = DPRContextEncoderTokenizerFast
def get_rust_tokenizer(self, **kwargs): test_rust_tokenizer = True
return DPRContextEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRQuestionEncoderTokenizer tokenizer_class = DPRQuestionEncoderTokenizer
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
def get_rust_tokenizer(self, **kwargs): test_rust_tokenizer = True
return DPRQuestionEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
class DPRReaderTokenizationTest(BertTokenizationTest): class DPRReaderTokenizationTest(BertTokenizationTest):
tokenizer_class = DPRReaderTokenizer tokenizer_class = DPRReaderTokenizer
rust_tokenizer_class = DPRReaderTokenizerFast
def get_rust_tokenizer(self, **kwargs): test_rust_tokenizer = True
return DPRReaderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@slow @slow
def test_decode_best_spans(self): def test_decode_best_spans(self):

File diff suppressed because it is too large Load Diff

View File

@@ -26,6 +26,7 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FunnelTokenizer tokenizer_class = FunnelTokenizer
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -26,6 +26,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = GPT2Tokenizer tokenizer_class = GPT2Tokenizer
rust_tokenizer_class = GPT2TokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
def setUp(self): def setUp(self):

View File

@@ -18,7 +18,7 @@ import os
import unittest import unittest
from transformers.tokenization_bert import VOCAB_FILES_NAMES from transformers.tokenization_bert import VOCAB_FILES_NAMES
from transformers.tokenization_lxmert import LxmertTokenizer from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -26,6 +26,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = LxmertTokenizer tokenizer_class = LxmertTokenizer
rust_tokenizer_class = LxmertTokenizerFast
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -49,9 +52,6 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return LxmertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00E9d,running" input_text = "UNwant\u00E9d,running"
output_text = "unwanted, running" output_text = "unwanted, running"
@@ -63,3 +63,25 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokens = tokenizer.tokenize("UNwant\u00E9d,running") tokens = tokenizer.tokenize("UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

View File

@@ -38,6 +38,7 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MarianTokenizer tokenizer_class = MarianTokenizer
test_rust_tokenizer = False
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -1,7 +1,7 @@
import tempfile import tempfile
import unittest import unittest
from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, is_torch_available from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
from transformers.testing_utils import require_torch from transformers.testing_utils import require_torch
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -17,6 +17,8 @@ RO_CODE = 250020
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBartTokenizer tokenizer_class = MBartTokenizer
rust_tokenizer_class = MBartTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -18,7 +18,7 @@ import json
import os import os
import unittest import unittest
from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = OpenAIGPTTokenizer tokenizer_class = OpenAIGPTTokenizer
rust_tokenizer_class = OpenAIGPTTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -3,7 +3,7 @@ from pathlib import Path
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch from transformers.testing_utils import require_torch
from transformers.tokenization_pegasus import PegasusTokenizer from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -11,6 +11,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PegasusTokenizer tokenizer_class = PegasusTokenizer
rust_tokenizer_class = PegasusTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -19,7 +19,7 @@ import unittest
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow from transformers.testing_utils import require_torch, slow
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ReformerTokenizer tokenizer_class = ReformerTokenizer
rust_tokenizer_class = ReformerTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -37,6 +39,28 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(self.tmpdirname)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_full_tokenizer(self): def test_full_tokenizer(self):
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)

View File

@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = RobertaTokenizer tokenizer_class = RobertaTokenizer
rust_tokenizer_class = RobertaTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -20,13 +20,12 @@ import unittest
from transformers import BatchEncoding from transformers import BatchEncoding
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import _torch_available from transformers.testing_utils import _torch_available
from transformers.tokenization_t5 import T5Tokenizer from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
from transformers.tokenization_xlnet import SPIECE_UNDERLINE
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
SPIECE_UNDERLINE = ""
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
FRAMEWORK = "pt" if _torch_available else "tf" FRAMEWORK = "pt" if _torch_available else "tf"
@@ -35,6 +34,8 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = T5Tokenizer tokenizer_class = T5Tokenizer
rust_tokenizer_class = T5TokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -113,6 +114,38 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def t5_base_tokenizer(self): def t5_base_tokenizer(self):
return T5Tokenizer.from_pretrained("t5-base") return T5Tokenizer.from_pretrained("t5-base")
@cached_property
def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("t5-base")
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_eos_treatment(self): def test_eos_treatment(self):
tokenizer = self.t5_base_tokenizer tokenizer = self.t5_base_tokenizer
batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"]) batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])

View File

@@ -17,20 +17,15 @@
import os import os
import unittest import unittest
from transformers import is_torch_available from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer
from transformers.testing_utils import require_torch
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
if is_torch_available():
from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer
@require_torch
class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = TransfoXLTokenizer if is_torch_available() else None tokenizer_class = TransfoXLTokenizer
test_rust_tokenizer = False
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -27,6 +27,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMTokenizer tokenizer_class = XLMTokenizer
test_rust_tokenizer = False
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -19,7 +19,7 @@ import unittest
from transformers.file_utils import cached_property from transformers.file_utils import cached_property
from transformers.testing_utils import slow from transformers.testing_utils import slow
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMRobertaTokenizer tokenizer_class = XLMRobertaTokenizer
rust_tokenizer_class = XLMRobertaTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -118,6 +120,28 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def big_tokenizer(self): def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
def test_rust_and_python_full_tokenizers(self):
    """Check that the slow and the Rust tokenizer agree on a sample sentence.

    Does nothing when ``self.test_rust_tokenizer`` is falsy. Otherwise the
    tokenizers returned by ``get_tokenizer`` and ``get_rust_tokenizer`` must
    yield the same token list, the same ids with ``add_special_tokens=False``
    and the same ids with the default ``encode`` arguments.
    """
    if self.test_rust_tokenizer:
        slow = self.get_tokenizer()
        fast = self.get_rust_tokenizer()

        text = "I was born in 92000, and this is falsé."

        # Token strings must be identical.
        self.assertListEqual(slow.tokenize(text), fast.tokenize(text))

        # Ids without the special tokens.
        slow_ids = slow.encode(text, add_special_tokens=False)
        fast_ids = fast.encode(text, add_special_tokens=False)
        self.assertListEqual(slow_ids, fast_ids)

        # A new Rust tokenizer is requested before the default-argument comparison.
        fast = self.get_rust_tokenizer()
        self.assertListEqual(slow.encode(text), fast.encode(text))
@slow @slow
def test_tokenization_base_easy_symbols(self): def test_tokenization_base_easy_symbols(self):
symbols = "Hello World!" symbols = "Hello World!"

View File

@@ -18,7 +18,7 @@ import os
import unittest import unittest
from transformers.testing_utils import slow from transformers.testing_utils import slow
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast
from .test_tokenization_common import TokenizerTesterMixin from .test_tokenization_common import TokenizerTesterMixin
@@ -29,12 +29,15 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLNetTokenizer tokenizer_class = XLNetTokenizer
rust_tokenizer_class = XLNetTokenizerFast
test_rust_tokenizer = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.sanitize_special_tokens()
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(self.tmpdirname)
def test_full_tokenizer(self): def test_full_tokenizer(self):