From 9f4e0c23d68366985f9f584388874477ad6472d8 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 5 Apr 2021 10:51:16 -0400 Subject: [PATCH] Documentation about loading a fast tokenizer within Transformers (#11029) * Documentation about loading a fast tokenizer within Transformers * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * style Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/fast_tokenizers.rst | 62 +++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/main_classes/tokenizer.rst | 5 ++ src/transformers/tokenization_utils_fast.py | 19 +++++-- tests/test_tokenization_utils.py | 31 ++++++++++- 5 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 docs/source/fast_tokenizers.rst diff --git a/docs/source/fast_tokenizers.rst b/docs/source/fast_tokenizers.rst new file mode 100644 index 0000000000..52584b7eb4 --- /dev/null +++ b/docs/source/fast_tokenizers.rst @@ -0,0 +1,62 @@ +Using tokenizers from 🤗 Tokenizers +======================================================================================================================= + +The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers +<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be +loaded very simply into 🤗 Transformers. + +Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines: + +.. code-block:: + + >>> from tokenizers import Tokenizer + >>> from tokenizers.models import BPE + >>> from tokenizers.trainers import BpeTrainer + >>> from tokenizers.pre_tokenizers import Whitespace + + >>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) + >>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + + >>> tokenizer.pre_tokenizer = Whitespace() + >>> files = [...] + >>> tokenizer.train(files, trainer) + +We now have a tokenizer trained on the files we defined. 
We can either continue using it in that runtime, or save it to +a JSON file for future re-use. + +Loading directly from the tokenizer object +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The +:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated +`tokenizer` object as an argument: + +.. code-block:: + + >>> from transformers import PreTrainedTokenizerFast + + >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) + +This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer +page <main_classes/tokenizer>` for more information. + +Loading from a JSON file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer: + +.. code-block:: + + >>> tokenizer.save("tokenizer.json") + +The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization +method using the :obj:`tokenizer_file` parameter: + +.. code-block:: + + >>> from transformers import PreTrainedTokenizerFast + + >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") + +This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer +page <main_classes/tokenizer>` for more information. diff --git a/docs/source/index.rst b/docs/source/index.rst index 16164a761a..9692abcde9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -384,6 +384,7 @@ TensorFlow and/or Flax. 
migration contributing add_new_model + fast_tokenizers testing serialization diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst index 3bd9b3a966..26cde90b32 100644 --- a/docs/source/main_classes/tokenizer.rst +++ b/docs/source/main_classes/tokenizer.rst @@ -62,6 +62,11 @@ PreTrainedTokenizer PreTrainedTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The :class:`~transformers.PreTrainedTokenizerFast` depend on the `tokenizers +<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 tokenizers library can be +loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokenizers from 🤗 tokenizers +<../fast_tokenizers>` page to understand how this is done. + .. autoclass:: transformers.PreTrainedTokenizerFast :special-members: __call__ :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 901447d568..706ee7e22c 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -54,6 +54,12 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json" # Slow tokenizers have an additional added tokens files ADDED_TOKENS_FILE = "added_tokens.json" +INIT_TOKENIZER_DOCSTRING += """ + tokenizer_object (:class:`tokenizers.Tokenizer`): + A :class:`tokenizers.Tokenizer` object from 🤗 tokenizers to instantiate from. See :doc:`Using tokenizers + from 🤗 tokenizers <../fast_tokenizers>` for more information. 
+""" + @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) class PreTrainedTokenizerFast(PreTrainedTokenizerBase): @@ -72,6 +78,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): slow_tokenizer_class: PreTrainedTokenizer = None def __init__(self, *args, **kwargs): + tokenizer_object = kwargs.pop("tokenizer_object", None) slow_tokenizer = kwargs.pop("__slow_tokenizer", None) fast_tokenizer_file = kwargs.pop("tokenizer_file", None) from_slow = kwargs.pop("from_slow", False) @@ -82,7 +89,9 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): "have sentencepiece installed." ) - if fast_tokenizer_file is not None and not from_slow: + if tokenizer_object is not None: + fast_tokenizer = tokenizer_object + elif fast_tokenizer_file is not None and not from_slow: # We have a serialization from tokenizers which let us directly build the backend fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) elif slow_tokenizer is not None: @@ -94,10 +103,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) else: raise ValueError( - "Couldn't instantiate the backend tokenizer from one of: " - "(1) a `tokenizers` library serialization file, " - "(2) a slow tokenizer instance to convert or " - "(3) an equivalent slow tokenizer class to instantiate and convert. " + "Couldn't instantiate the backend tokenizer from one of: \n" + "(1) a `tokenizers` library serialization file, \n" + "(2) a slow tokenizer instance to convert or \n" + "(3) an equivalent slow tokenizer class to instantiate and convert. \n" "You need to have sentencepiece installed to convert a slow tokenizer to a fast one." ) diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py index 7401d183e6..534d945458 100644 --- a/tests/test_tokenization_utils.py +++ b/tests/test_tokenization_utils.py @@ -12,18 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - +import os import pickle +import tempfile import unittest from typing import Callable, Optional import numpy as np -from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType, TokenSpan +from transformers import ( + BatchEncoding, + BertTokenizer, + BertTokenizerFast, + PreTrainedTokenizer, + PreTrainedTokenizerFast, + TensorType, + TokenSpan, + is_tokenizers_available, +) from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow +if is_tokenizers_available(): + from tokenizers import Tokenizer + from tokenizers.models import WordPiece + + class TokenizerUtilsTest(unittest.TestCase): def check_tokenizer_from_pretrained(self, tokenizer_class): s3_models = list(tokenizer_class.max_model_input_sizes.keys()) @@ -253,3 +268,15 @@ class TokenizerUtilsTest(unittest.TestCase): batch = tokenizer.pad(features, padding=True, return_tensors="tf") self.assertTrue(isinstance(batch["input_ids"], tf.Tensor)) self.assertEqual(batch["input_ids"].numpy().tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]]) + + @require_tokenizers + def test_instantiation_from_tokenizers(self): + bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) + PreTrainedTokenizerFast(tokenizer_object=bert_tokenizer) + + @require_tokenizers + def test_instantiation_from_tokenizers_json_file(self): + bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]")) + with tempfile.TemporaryDirectory() as tmpdirname: + bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json")) + PreTrainedTokenizerFast(tokenizer_file=os.path.join(tmpdirname, "tokenizer.json"))