[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP] * [WIP] splitting sentencepiece and tokenizers dependencies * update dummy objects * add name_or_path to models and tokenizers * prefix added to file names * prefix * styling + quality * splitting all the tokenizer files - sorting sentencepiece based ones * update tokenizer version up to 0.9.0 * remove hard dependency on sentencepiece 🎉 * and removed hard dependency on tokenizers 🎉 * update conversion script * update missing models * fixing tests * move test_tokenization_fast to main tokenization tests - fix bugs * bump up tokenizers * fix bert_generation * update and fix several tokenizers * keep sentencepiece in deps for now * fix funnel and deberta tests * fix fsmt * fix marian tests * fix layoutlm * fix squeezebert and gpt2 * fix T5 tokenization * fix xlnet tests * style * fix mbart * bump up tokenizers to 0.9.2 * fix model tests * fix tf models * fix seq2seq examples * fix tests without sentencepiece * fix slow => fast conversion without sentencepiece * update auto and bert generation tests * fix mbart tests * fix auto and common test without tokenizers * fix tests without tokenizers * clean up tests lighten up when tokenizers + sentencepiece are both off * style quality and tests fixing * add sentencepiece to doc/examples reqs * leave sentencepiece on for now * style quality split herbert and fix pegasus * WIP Herbert fast * add sample_text_no_unicode and fix herbert tokenization * skip FSMT example test for now * fix style * fix fsmt in example tests * update following Lysandre and Sylvain's comments * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-18 20:51:24 +02:00
parent c65863ce53
commit ba8c4d0ac0
140 changed files with 6551 additions and 3961 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -23,20 +23,19 @@ import json
 import os
 import warnings
 from collections import OrderedDict, UserDict
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

 import numpy as np

-from tokenizers import AddedToken
-from tokenizers import Encoding as EncodingFast
-
 from .file_utils import (
    add_end_docstrings,
    cached_path,
    hf_bucket_url,
    is_remote_url,
    is_tf_available,
+    is_tokenizers_available,
    is_torch_available,
    torch_required,
 )
@@ -45,9 +44,36 @@ from .utils import logging

 if is_tf_available():
    import tensorflow as tf
+
 if is_torch_available():
    import torch

+if is_tokenizers_available():
+    from tokenizers import AddedToken
+    from tokenizers import Encoding as EncodingFast
+else:
+
+    @dataclass(frozen=True, eq=True)
+    class AddedToken:
+        """AddedToken represents a token to be added to a Tokenizer
+        An AddedToken can have special options defining the way it should behave.
+        """
+
+        content: str = field(default_factory=str)
+        single_word: bool = False
+        lstrip: bool = False
+        rstrip: bool = False
+        normalized: bool = True
+
+        def __getstate__(self):
+            return self.__dict__
+
+    @dataclass
+    class EncodingFast:
+        """ This is a dummy class because without the `tokenizers` library we don't have these objects anyway """
+
+        pass
+

 logger = logging.get_logger(__name__)

@@ -1304,6 +1330,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
        self.init_inputs = ()
        self.init_kwargs = copy.deepcopy(kwargs)
+        self.name_or_path = kwargs.pop("name_or_path", "")

        # For backward compatibility we fallback to set model_max_length from max_len if provided
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
@@ -1377,6 +1404,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
            )

+    def __repr__(self) -> str:
+        return (
+            f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', "
+            f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
+            f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
+        )
+
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        r"""
@@ -1562,7 +1596,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        # We instantiate fast tokenizers based on a slow tokenizer for now
        # In the future we can also use a direct way based on saving/instantiating
        # tokenizer's Tokenizer directly from it's serialization JSON
-        if cls.slow_tokenizer_class is not None:
+        if (
+            "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
+        ) and cls.slow_tokenizer_class is not None:
            slow_tokenizer = cls.slow_tokenizer_class._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
@@ -1618,6 +1654,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer

+        init_kwargs["name_or_path"] = pretrained_model_name_or_path
+
        # Instantiate tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
@@ -1669,7 +1707,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        return tokenizer

-    def save_pretrained(self, save_directory: str) -> Tuple[str]:
+    def save_pretrained(
+        self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
        """
        Save the full tokenizer state.

@@ -1688,7 +1728,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
           modifying :obj:`tokenizer.do_lower_case` after creation).

        Args:
-            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
+            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
+            legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and
+                a separate added_tokens file or in the unified JSON file format for the `tokenizers` library.
+                It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible
+                with "slow" tokenizers (not powered by the `tokenizers` library).
+            filename_prefix (:obj:`str`, `optional`):
+                A prefix to add to the names of the files saved by the tokenizer.

        Returns:
            A tuple of :obj:`str`: The files saved.
@@ -1698,8 +1745,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
            return
        os.makedirs(save_directory, exist_ok=True)

-        special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
-        tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
+        special_tokens_map_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
+        )
+        tokenizer_config_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
+        )

        tokenizer_config = copy.deepcopy(self.init_kwargs)
        if len(self.init_inputs) > 0:
@@ -1732,19 +1783,61 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):

        file_names = (tokenizer_config_file, special_tokens_map_file)

-        return self._save_pretrained(save_directory, file_names)
+        return self._save_pretrained(
+            save_directory=save_directory,
+            file_names=file_names,
+            legacy_format=legacy_format,
+            filename_prefix=filename_prefix,
+        )

-    def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]:
-        added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
+    def _save_pretrained(
+        self,
+        save_directory: str,
+        file_names: Tuple[str],
+        legacy_format: bool = True,
+        filename_prefix: Optional[str] = None,
+    ) -> Tuple[str]:
+        """Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
+
+        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens}
+        using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
+        """
+        if not legacy_format:
+            raise ValueError(
+                "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
+            )
+
+        added_tokens_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
+        )
        added_vocab = self.get_added_vocab()
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, ensure_ascii=False)
                f.write(out_str)

-        vocab_files = self.save_vocabulary(save_directory)
+        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

-        return file_names + (vocab_files, added_tokens_file)
+        return file_names + vocab_files + (added_tokens_file,)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary + added tokens).
+
+        This method won't save the configuration and special token mappings of the tokenizer.
+        Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save
+        the whole state of the tokenizer.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (:obj:`str`, `optional`):
+                An optional prefix to add to the names of the saved files.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
+        raise NotImplementedError

    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,