From 5b570754495cbf301f0f0d4a49849b28c66a687c Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 22 Sep 2021 19:00:47 -0400 Subject: [PATCH] Add BlenderBot small tokenizer to the init (#13367) * Add BlenderBot small tokenizer to the init * Update src/transformers/__init__.py Co-authored-by: Suraj Patil * Style * Bugfix Co-authored-by: Suraj Patil --- docs/source/index.rst | 2 +- docs/source/model_doc/blenderbot_small.rst | 7 +++++++ src/transformers/__init__.py | 2 ++ src/transformers/models/blenderbot_small/__init__.py | 8 +++++++- .../tokenization_blenderbot_small_fast.py | 4 ++-- src/transformers/utils/dummy_tokenizers_objects.py | 9 +++++++++ 6 files changed, 28 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index f4cd257a5b..5521ec44d5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -354,7 +354,7 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BlenderbotSmall | ✅ | ❌ | ✅ | ✅ | ❌ | +| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ diff --git a/docs/source/model_doc/blenderbot_small.rst b/docs/source/model_doc/blenderbot_small.rst index 4d2a5339c3..5e64ac0970 100644 --- a/docs/source/model_doc/blenderbot_small.rst +++ b/docs/source/model_doc/blenderbot_small.rst @@ -57,6 +57,13 @@ BlenderbotSmallTokenizer create_token_type_ids_from_sequences, save_vocabulary +BlenderbotSmallTokenizerFast +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BlenderbotSmallTokenizerFast + :members: + + BlenderbotSmallModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 07dd405972..66415e60cb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -370,6 +370,7 @@ if is_tokenizers_available(): _import_structure["models.roformer"].append("RoFormerTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") + _import_structure["models.blenderbot_small"].append("BlenderbotSmallTokenizerFast") _import_structure["models.albert"].append("AlbertTokenizerFast") _import_structure["models.bart"].append("BartTokenizerFast") _import_structure["models.barthez"].append("BarthezTokenizerFast") @@ -2182,6 +2183,7 @@ if TYPE_CHECKING: from .models.barthez import BarthezTokenizerFast from .models.bert import BertTokenizerFast from .models.big_bird import BigBirdTokenizerFast + from .models.blenderbot_small import BlenderbotSmallTokenizerFast from .models.camembert import CamembertTokenizerFast from .models.clip import CLIPTokenizerFast from .models.convbert import ConvBertTokenizerFast diff --git a/src/transformers/models/blenderbot_small/__init__.py b/src/transformers/models/blenderbot_small/__init__.py index febe5ae90a..f9447d8a80 100644 --- a/src/transformers/models/blenderbot_small/__init__.py +++ b/src/transformers/models/blenderbot_small/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...file_utils import _LazyModule, is_tf_available, is_torch_available +from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available _import_structure = { @@ -25,6 +25,9 @@ _import_structure = { "tokenization_blenderbot_small": ["BlenderbotSmallTokenizer"], } +if is_tokenizers_available(): + _import_structure["tokenization_blenderbot_small_fast"] = ["BlenderbotSmallTokenizerFast"] + if is_torch_available(): _import_structure["modeling_blenderbot_small"] = [ "BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -45,6 +48,9 @@ if TYPE_CHECKING: from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig from .tokenization_blenderbot_small import BlenderbotSmallTokenizer + if is_tokenizers_available(): + from .tokenization_blenderbot_small_fast import BlenderbotSmallTokenizerFast + if is_torch_available(): from .modeling_blenderbot_small import ( BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 0068eba311..2867b598b7 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -74,8 +74,8 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast): ): super().__init__( ByteLevelBPETokenizer( - vocab_file=vocab_file, - merges_file=merges_file, + vocab=vocab_file, + merges=merges_file, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets, ), diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index b2d4feb0d1..eb79f72d70 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -47,6 +47,15 @@ class BigBirdTokenizerFast: requires_backends(cls, ["tokenizers"]) +class BlenderbotSmallTokenizerFast: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tokenizers"]) + + class CamembertTokenizerFast: def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"])