[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP] * [WIP] splitting sentencepiece and tokenizers dependencies * update dummy objects * add name_or_path to models and tokenizers * prefix added to file names * prefix * styling + quality * splitting all the tokenizer files - sorting sentencepiece based ones * update tokenizer version up to 0.9.0 * remove hard dependency on sentencepiece 🎉 * and removed hard dependency on tokenizers 🎉 * update conversion script * update missing models * fixing tests * move test_tokenization_fast to main tokenization tests - fix bugs * bump up tokenizers * fix bert_generation * update and fix several tokenizers * keep sentencepiece in deps for now * fix funnel and deberta tests * fix fsmt * fix marian tests * fix layoutlm * fix squeezebert and gpt2 * fix T5 tokenization * fix xlnet tests * style * fix mbart * bump up tokenizers to 0.9.2 * fix model tests * fix tf models * fix seq2seq examples * fix tests without sentencepiece * fix slow => fast conversion without sentencepiece * update auto and bert generation tests * fix mbart tests * fix auto and common test without tokenizers * fix tests without tokenizers * clean up tests lighten up when tokenizers + sentencepiece are both off * style quality and tests fixing * add sentencepiece to doc/examples reqs * leave sentencepiece on for now * style quality split herbert and fix pegasus * WIP Herbert fast * add sample_text_no_unicode and fix herbert tokenization * skip FSMT example test for now * fix style * fix fsmt in example tests * update following Lysandre and Sylvain's comments * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/testing_utils.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update src/transformers/tokenization_utils_base.py 
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-18 20:51:24 +02:00
parent c65863ce53
commit ba8c4d0ac0
140 changed files with 6551 additions and 3961 deletions
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -49,6 +49,7 @@ def {0}(*args, **kwargs):
    requires_pytorch({0})
 """

+
 DUMMY_TF_PRETRAINED_CLASS = """
 class {0}:
    def __init__(self, *args, **kwargs):
@@ -71,12 +72,111 @@ def {0}(*args, **kwargs):
 """


+DUMMY_SENTENCEPIECE_PRETRAINED_CLASS = """
+class {0}:
+    def __init__(self, *args, **kwargs):
+        requires_sentencepiece(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_sentencepiece(self)
+"""
+
+DUMMY_SENTENCEPIECE_CLASS = """
+class {0}:
+    def __init__(self, *args, **kwargs):
+        requires_sentencepiece(self)
+"""
+
+DUMMY_SENTENCEPIECE_FUNCTION = """
+def {0}(*args, **kwargs):
+    requires_sentencepiece({0})
+"""
+
+
+DUMMY_TOKENIZERS_PRETRAINED_CLASS = """
+class {0}:
+    def __init__(self, *args, **kwargs):
+        requires_tokenizers(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_tokenizers(self)
+"""
+
+DUMMY_TOKENIZERS_CLASS = """
+class {0}:
+    def __init__(self, *args, **kwargs):
+        requires_tokenizers(self)
+"""
+
+DUMMY_TOKENIZERS_FUNCTION = """
+def {0}(*args, **kwargs):
+    requires_tokenizers({0})
+"""
+
+# Map all these to dummy type
+
+DUMMY_PRETRAINED_CLASS = {
+    "pt": DUMMY_PT_PRETRAINED_CLASS,
+    "tf": DUMMY_TF_PRETRAINED_CLASS,
+    "sentencepiece": DUMMY_SENTENCEPIECE_PRETRAINED_CLASS,
+    "tokenizers": DUMMY_TOKENIZERS_PRETRAINED_CLASS,
+}
+
+DUMMY_CLASS = {
+    "pt": DUMMY_PT_CLASS,
+    "tf": DUMMY_TF_CLASS,
+    "sentencepiece": DUMMY_SENTENCEPIECE_CLASS,
+    "tokenizers": DUMMY_TOKENIZERS_CLASS,
+}
+
+DUMMY_FUNCTION = {
+    "pt": DUMMY_PT_FUNCTION,
+    "tf": DUMMY_TF_FUNCTION,
+    "sentencepiece": DUMMY_SENTENCEPIECE_FUNCTION,
+    "tokenizers": DUMMY_TOKENIZERS_FUNCTION,
+}
+
+
 def read_init():
-    """ Read the init and exctracts PyTorch and TensorFlow objects. """
+    """ Read the init and exctracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """
    with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8") as f:
        lines = f.readlines()

    line_index = 0
+    # Find where the SentencePiece imports begin
+    sentencepiece_objects = []
+    while not lines[line_index].startswith("if is_sentencepiece_available():"):
+        line_index += 1
+    line_index += 1
+
+    # Until we unindent, add SentencePiece objects to the list
+    while len(lines[line_index]) <= 1 or lines[line_index].startswith("    "):
+        line = lines[line_index]
+        search = _re_single_line_import.search(line)
+        if search is not None:
+            sentencepiece_objects += search.groups()[0].split(", ")
+        elif line.startswith("        "):
+            sentencepiece_objects.append(line[8:-2])
+        line_index += 1
+
+    # Find where the Tokenizers imports begin
+    tokenizers_objects = []
+    while not lines[line_index].startswith("if is_tokenizers_available():"):
+        line_index += 1
+    line_index += 1
+
+    # Until we unindent, add Tokenizers objects to the list
+    while len(lines[line_index]) <= 1 or lines[line_index].startswith("    "):
+        line = lines[line_index]
+        search = _re_single_line_import.search(line)
+        if search is not None:
+            tokenizers_objects += search.groups()[0].split(", ")
+        elif line.startswith("        "):
+            tokenizers_objects.append(line[8:-2])
+        line_index += 1
+
    # Find where the PyTorch imports begin
    pt_objects = []
    while not lines[line_index].startswith("if is_torch_available():"):
@@ -108,10 +208,10 @@ def read_init():
        elif line.startswith("        "):
            tf_objects.append(line[8:-2])
        line_index += 1
-    return pt_objects, tf_objects
+    return sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects


-def create_dummy_object(name, is_pytorch=True):
+def create_dummy_object(name, type="pt"):
    """ Create the code for the dummy object corresponding to `name`."""
    _pretrained = [
        "Config" "ForCausalLM",
@@ -124,10 +224,11 @@ def create_dummy_object(name, is_pytorch=True):
        "Model",
        "Tokenizer",
    ]
+    assert type in ["pt", "tf", "sentencepiece", "tokenizers"]
    if name.isupper():
        return DUMMY_CONSTANT.format(name)
    elif name.islower():
-        return (DUMMY_PT_FUNCTION if is_pytorch else DUMMY_TF_FUNCTION).format(name)
+        return (DUMMY_FUNCTION[type]).format(name)
    else:
        is_pretrained = False
        for part in _pretrained:
@@ -135,39 +236,75 @@ def create_dummy_object(name, is_pytorch=True):
                is_pretrained = True
                break
        if is_pretrained:
-            template = DUMMY_PT_PRETRAINED_CLASS if is_pytorch else DUMMY_TF_PRETRAINED_CLASS
+            template = DUMMY_PRETRAINED_CLASS[type]
        else:
-            template = DUMMY_PT_CLASS if is_pytorch else DUMMY_TF_CLASS
+            template = DUMMY_CLASS[type]
        return template.format(name)


 def create_dummy_files():
    """ Create the content of the dummy files. """
-    pt_objects, tf_objects = read_init()
+    sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects = read_init()
+
+    sentencepiece_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
+    sentencepiece_dummies += "from ..file_utils import requires_sentencepiece\n\n"
+    sentencepiece_dummies += "\n".join([create_dummy_object(o, type="sentencepiece") for o in sentencepiece_objects])
+
+    tokenizers_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
+    tokenizers_dummies += "from ..file_utils import requires_tokenizers\n\n"
+    tokenizers_dummies += "\n".join([create_dummy_object(o, type="tokenizers") for o in tokenizers_objects])

    pt_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
    pt_dummies += "from ..file_utils import requires_pytorch\n\n"
-    pt_dummies += "\n".join([create_dummy_object(o) for o in pt_objects])
+    pt_dummies += "\n".join([create_dummy_object(o, type="pt") for o in pt_objects])

    tf_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
    tf_dummies += "from ..file_utils import requires_tf\n\n"
-    tf_dummies += "\n".join([create_dummy_object(o, False) for o in tf_objects])
+    tf_dummies += "\n".join([create_dummy_object(o, type="tf") for o in tf_objects])

-    return pt_dummies, tf_dummies
+    return sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies


 def check_dummies(overwrite=False):
    """ Check if the dummy files are up to date and maybe `overwrite` with the right content. """
-    pt_dummies, tf_dummies = create_dummy_files()
+    sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies = create_dummy_files()
    path = os.path.join(PATH_TO_TRANSFORMERS, "utils")
+    sentencepiece_file = os.path.join(path, "dummy_sentencepiece_objects.py")
+    tokenizers_file = os.path.join(path, "dummy_tokenizers_objects.py")
    pt_file = os.path.join(path, "dummy_pt_objects.py")
    tf_file = os.path.join(path, "dummy_tf_objects.py")

+    with open(sentencepiece_file, "r", encoding="utf-8") as f:
+        actual_sentencepiece_dummies = f.read()
+    with open(tokenizers_file, "r", encoding="utf-8") as f:
+        actual_tokenizers_dummies = f.read()
    with open(pt_file, "r", encoding="utf-8") as f:
        actual_pt_dummies = f.read()
    with open(tf_file, "r", encoding="utf-8") as f:
        actual_tf_dummies = f.read()

+    if sentencepiece_dummies != actual_sentencepiece_dummies:
+        if overwrite:
+            print("Updating transformers.utils.dummy_sentencepiece_objects.py as the main __init__ has new objects.")
+            with open(sentencepiece_file, "w", encoding="utf-8") as f:
+                f.write(sentencepiece_dummies)
+        else:
+            raise ValueError(
+                "The main __init__ has objects that are not present in transformers.utils.dummy_sentencepiece_objects.py.",
+                "Run `make fix-copies` to fix this.",
+            )
+
+    if tokenizers_dummies != actual_tokenizers_dummies:
+        if overwrite:
+            print("Updating transformers.utils.dummy_tokenizers_objects.py as the main __init__ has new objects.")
+            with open(tokenizers_file, "w", encoding="utf-8") as f:
+                f.write(tokenizers_dummies)
+        else:
+            raise ValueError(
+                "The main __init__ has objects that are not present in transformers.utils.dummy_tokenizers_objects.py.",
+                "Run `make fix-copies` to fix this.",
+            )
+
    if pt_dummies != actual_pt_dummies:
        if overwrite:
            print("Updating transformers.utils.dummy_pt_objects.py as the main __init__ has new objects.")