Make tiny model creation + pipeline testing more robust (#22500)

* Final Tiny things --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-04-06 17:45:55 +02:00
parent 12d51db243
commit 2c22bc79c2
8 changed files with 161 additions and 64 deletions
--- a/utils/create_dummy_models.py
+++ b/utils/create_dummy_models.py
@@ -18,6 +18,7 @@ import collections.abc
 import copy
 import inspect
 import json
+import multiprocessing
 import os
 import shutil
 import tempfile
@@ -679,12 +680,22 @@ def convert_processors(processors, tiny_config, output_folder, result):

    if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
        if fast_tokenizer is not None:
-            if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
+            if fast_tokenizer.__class__.__name__ in [
+                "RobertaTokenizerFast",
+                "XLMRobertaTokenizerFast",
+                "LongformerTokenizerFast",
+                "MPNetTokenizerFast",
+            ]:
                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
            else:
                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
        if slow_tokenizer is not None:
-            if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
+            if slow_tokenizer.__class__.__name__ in [
+                "RobertaTokenizer",
+                "XLMRobertaTokenizer",
+                "LongformerTokenizer",
+                "MPNetTokenizer",
+            ]:
                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
            else:
                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
@@ -1047,6 +1058,10 @@ def build(config_class, models_to_create, output_dir):
            The directory to save all the checkpoints. Each model architecture will be saved in a subdirectory under
            it. Models in different frameworks with the same architecture will be saved in the same subdirectory.
    """
+    if data["training_ds"] is None or data["testing_ds"] is None:
+        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
+        data["training_ds"] = ds["train"]
+        data["testing_ds"] = ds["test"]

    if config_class.model_type in [
        "encoder-decoder",
@@ -1323,6 +1338,7 @@ def create_tiny_models(
    upload,
    organization,
    token,
+    num_workers=1,
 ):
    clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    if os.getcwd() != clone_path:
@@ -1343,10 +1359,6 @@ def create_tiny_models(
    pytorch_arch_mappings = [getattr(transformers_module, x) for x in _pytorch_arch_mappings]
    tensorflow_arch_mappings = [getattr(transformers_module, x) for x in _tensorflow_arch_mappings]

-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    data["training_ds"] = ds["train"]
-    data["testing_ds"] = ds["test"]
-
    config_classes = CONFIG_MAPPING.values()
    if not all:
        config_classes = [CONFIG_MAPPING[model_type] for model_type in model_types]
@@ -1363,11 +1375,19 @@ def create_tiny_models(
            to_create[c] = {"processor": processors, "pytorch": models, "tensorflow": tf_models}

    results = {}
-    for c, models_to_create in list(to_create.items()):
-        print(f"Create models for {c.__name__} ...")
-        result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
-        results[c.__name__] = result
-        print("=" * 40)
+    if num_workers <= 1:
+        for c, models_to_create in list(to_create.items()):
+            print(f"Create models for {c.__name__} ...")
+            result = build(c, models_to_create, output_dir=os.path.join(output_path, c.model_type))
+            results[c.__name__] = result
+            print("=" * 40)
+    else:
+        all_build_args = []
+        for c, models_to_create in list(to_create.items()):
+            all_build_args.append((c, models_to_create, os.path.join(output_path, c.model_type)))
+        with multiprocessing.Pool(num_workers) as pool:  # cap at `num_workers`; bare Pool() uses every core
+            results = pool.starmap(build, all_build_args)
+            results = {build_args[0].__name__: result for build_args, result in zip(all_build_args, results)}

    if upload:
        if organization is None:
@@ -1426,9 +1446,8 @@ def create_tiny_models(


 if __name__ == "__main__":
-    ds = load_dataset("wikitext", "wikitext-2-raw-v1")
-    training_ds = ds["train"]
-    testing_ds = ds["test"]
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")

    def list_str(values):
        return values.split(",")
@@ -1465,6 +1484,7 @@ if __name__ == "__main__":
        "--token", default=None, type=str, help="A valid authentication token for HuggingFace Hub with write access."
    )
    parser.add_argument("output_path", type=Path, help="Path indicating where to store generated model.")
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")

    args = parser.parse_args()

@@ -1480,4 +1500,5 @@ if __name__ == "__main__":
        args.upload,
        args.organization,
        args.token,
+        args.num_workers,
    )
--- a/utils/update_tiny_models.py
+++ b/utils/update_tiny_models.py
@@ -21,8 +21,10 @@ version of `tests/utils/tiny_model_summary.json`. That updated file should be me
 """


+import argparse
 import copy
 import json
+import multiprocessing
 import os
 import time

@@ -197,6 +199,13 @@ def update_tiny_model_summary_file(report_path):


 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--num_workers", default=1, type=int, help="The number of workers to run.")
+    args = parser.parse_args()
+
+    # This has to be `spawn` to avoid hanging forever!
+    multiprocessing.set_start_method("spawn")
+
    output_path = "tiny_models"
    all = True
    model_types = None
@@ -214,6 +223,7 @@ if __name__ == "__main__":
        upload,
        organization,
        token=os.environ.get("TOKEN", None),
+        num_workers=args.num_workers,
    )

    update_tiny_model_summary_file(report_path=os.path.join(output_path, "reports"))