[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP] * [WIP] splitting sentencepiece and tokenizers dependencies * update dummy objects * add name_or_path to models and tokenizers * prefix added to file names * prefix * styling + quality * spliting all the tokenizer files - sorting sentencepiece based ones * update tokenizer version up to 0.9.0 * remove hard dependency on sentencepiece 🎉 * and removed hard dependency on tokenizers 🎉 * update conversion script * update missing models * fixing tests * move test_tokenization_fast to main tokenization tests - fix bugs * bump up tokenizers * fix bert_generation * update ad fix several tokenizers * keep sentencepiece in deps for now * fix funnel and deberta tests * fix fsmt * fix marian tests * fix layoutlm * fix squeezebert and gpt2 * fix T5 tokenization * fix xlnet tests * style * fix mbart * bump up tokenizers to 0.9.2 * fix model tests * fix tf models * fix seq2seq examples * fix tests without sentencepiece * fix slow => fast conversion without sentencepiece * update auto and bert generation tests * fix mbart tests * fix auto and common test without tokenizers * fix tests without tokenizers * clean up tests lighten up when tokenizers + sentencepiece are both off * style quality and tests fixing * add sentencepiece to doc/examples reqs * leave sentencepiece on for now * style quality split hebert and fix pegasus * WIP Herbert fast * add sample_text_no_unicode and fix hebert tokenization * skip FSMT example test for now * fix style * fix fsmt in example tests * update following Lysandre and Sylvain's comments * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-18 20:51:24 +02:00
parent c65863ce53
commit ba8c4d0ac0
140 changed files with 6551 additions and 3961 deletions
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -157,6 +157,24 @@ except (AttributeError, ImportError, KeyError):
    _in_notebook = False


+try:
+    import sentencepiece  # noqa: F401
+
+    _sentencepiece_available = True
+
+except ImportError:
+    _sentencepiece_available = False
+
+
+try:
+    import tokenizers  # noqa: F401
+
+    _tokenizers_available = True
+
+except ImportError:
+    _tokenizers_available = False
+
+
 default_cache_path = os.path.join(torch_cache_home, "transformers")


@@ -170,6 +188,8 @@ TF_WEIGHTS_NAME = "model.ckpt"
 CONFIG_NAME = "config.json"
 MODEL_CARD_NAME = "modelcard.json"

+SENTENCEPIECE_UNDERLINE = "▁"
+SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE  # Kept for backward compatibility

 MULTIPLE_CHOICE_DUMMY_INPUTS = [
    [[0, 1, 0, 1], [1, 0, 0, 1]]
@@ -217,6 +237,18 @@ def is_faiss_available():
    return _faiss_available


+def is_sklearn_available():
+    return _has_sklearn
+
+
+def is_sentencepiece_available():
+    return _sentencepiece_available
+
+
+def is_tokenizers_available():
+    return _tokenizers_available
+
+
 def is_in_notebook():
    return _in_notebook

@@ -234,10 +266,6 @@ def torch_only_method(fn):
    return wrapper


-def is_sklearn_available():
-    return _has_sklearn
-
-
 DATASETS_IMPORT_ERROR = """
 {0} requires the 🤗 Datasets library but it was not found in your enviromnent. You can install it with:
 ```
@@ -255,6 +283,25 @@ that python file if that's the case.
 """


+TOKENIZERS_IMPORT_ERROR = """
+{0} requires the 🤗 Tokenizers library but it was not found in your enviromnent. You can install it with:
+```
+pip install tokenizers
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install tokenizers
+```
+"""
+
+
+SENTENCEPIECE_IMPORT_ERROR = """
+{0} requires the SentencePiece library but it was not found in your enviromnent. Checkout the instructions on the
+installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
+that match your enviromnent.
+"""
+
+
 FAISS_IMPORT_ERROR = """
 {0} requires the faiss library but it was not found in your enviromnent. Checkout the instructions on the
 installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
@@ -316,6 +363,18 @@ def requires_tf(obj):
        raise ImportError(TENSORFLOW_IMPORT_ERROR.format(name))


+def requires_tokenizers(obj):
+    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+    if not is_tokenizers_available():
+        raise ImportError(TOKENIZERS_IMPORT_ERROR.format(name))
+
+
+def requires_sentencepiece(obj):
+    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+    if not is_sentencepiece_available():
+        raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))
+
+
 def add_start_docstrings(*docstr):
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")