From 7898fc03b1dc837af8ff6e671aa72372b8802a86 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 4 Feb 2021 03:34:23 -0500
Subject: [PATCH] Add `from_slow` in fast tokenizers build and fix some bugs
(#9987)
---
src/transformers/models/albert/tokenization_albert.py | 5 +++--
.../models/albert/tokenization_albert_fast.py | 11 ++++++-----
src/transformers/models/auto/tokenization_auto.py | 1 +
src/transformers/models/bart/tokenization_bart.py | 8 +++-----
.../models/bart/tokenization_bart_fast.py | 7 +++++++
.../models/barthez/tokenization_barthez.py | 5 +++--
.../models/barthez/tokenization_barthez_fast.py | 3 ---
.../models/blenderbot/tokenization_blenderbot.py | 2 +-
.../models/camembert/tokenization_camembert.py | 5 +++--
.../models/camembert/tokenization_camembert_fast.py | 4 ----
src/transformers/models/t5/tokenization_t5.py | 4 ++++
.../xlm_prophetnet/tokenization_xlm_prophetnet.py | 5 +++--
.../models/xlm_roberta/tokenization_xlm_roberta.py | 5 +++--
.../xlm_roberta/tokenization_xlm_roberta_fast.py | 3 ---
src/transformers/tokenization_utils_base.py | 11 +++++------
src/transformers/tokenization_utils_fast.py | 9 ++++++++-
16 files changed, 50 insertions(+), 38 deletions(-)
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index a9bb75e95f..890c7f8707 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index abc305e4f1..5cfa584386 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
- .. note:: When building a sequence using special tokens, this is not the token that is used for the
- beginning of sequence. The token used is the :obj:`cls_token`.
+
+ .. note::
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
that is used for the end of sequence. The token used is the :obj:`sep_token`.
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
- modeling. This is the token which the model will try to predict. Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+ modeling. This is the token which the model will try to predict.
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 06bac0c2ff..470e8ce8e8 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
HerbertTokenizerFast,
PhobertTokenizer,
BarthezTokenizer,
+ BarthezTokenizerFast,
]
diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py
index 4a468d811e..eea85b00cd 100644
--- a/src/transformers/models/bart/tokenization_bart.py
+++ b/src/transformers/models/bart/tokenization_bart.py
@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
r"""
Construct a BART tokenizer.
- :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
- :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
-
- Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
- initialization parameters and other methods.
+ :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
+ :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+ parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py
index 87ae615821..83fca126fa 100644
--- a/src/transformers/models/bart/tokenization_bart_fast.py
+++ b/src/transformers/models/bart/tokenization_bart_fast.py
@@ -37,6 +37,13 @@ _all_bart_models = [
class BartTokenizerFast(RobertaTokenizerFast):
+ r"""
+ Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+
+ :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
+ superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+ initialization parameters and other methods.
+ """
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py
index f9d13681da..d751de0e0c 100644
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py
index 2d669ff8db..070d6e6c7e 100644
--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
-
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py
index c502f73e8c..725f31605d 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
Construct a Blenderbot tokenizer.
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
- end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
+ end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index b119292cef..0be12e76be 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index 813eedcdc3..437fa77173 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
-
- Attributes:
- sp_model (:obj:`SentencePieceProcessor`):
- The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 39d0783f4c..4dcd51d494 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
+
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
index b3ec309968..c621070522 100644
--- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index b7a00494ea..2d85e30dd0 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
+ Attributes:
+ sp_model (:obj:`SentencePieceProcessor`):
+ The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
index 902aa2d274..befd84be94 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`):
Additional special tokens used by the tokenizer.
-
- Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
- conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 4c801f9144..425942668c 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
):
- # We instantiate fast tokenizers based on a slow tokenizer for now
- # In the future we can also use a direct way based on saving/instantiating
- # tokenizer's Tokenizer directly from it's serialization JSON
- if (
- "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
- ) and cls.slow_tokenizer_class is not None:
+ # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
+ # file or if `from_slow` is set to True.
+ from_slow = kwargs.get("from_slow", False)
+ has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
+ if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 92388507d2..4ee82a4552 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
def __init__(self, *args, **kwargs):
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
+ from_slow = kwargs.pop("from_slow", False)
- if fast_tokenizer_file is not None:
+ if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+ raise ValueError(
+ "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+ "have sentencepiece installed."
+ )
+
+ if fast_tokenizer_file is not None and not from_slow:
# We have a serialization from tokenizers which let us directly build the backend
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
elif slow_tokenizer is not None: