Pipeline testing - using tiny models on Hub (#20426)

* rework pipeline tests * run pipeline tests * fix * fix * fix * revert the changes in get_test_pipeline() parameter list * fix expected error message * skip a test * clean up --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-01-30 10:39:43 +01:00
parent a582cfce3c
commit c749bd405e
26 changed files with 286 additions and 274 deletions
--- a/utils/create_dummy_models.py
+++ b/utils/create_dummy_models.py
@@ -15,6 +15,7 @@

 import argparse
 import collections.abc
+import copy
 import importlib
 import inspect
 import json
@@ -31,6 +32,7 @@ from huggingface_hub import Repository, create_repo, upload_folder
 from transformers import (
    CONFIG_MAPPING,
    FEATURE_EXTRACTOR_MAPPING,
+    IMAGE_PROCESSOR_MAPPING,
    PROCESSOR_MAPPING,
    TOKENIZER_MAPPING,
    AutoTokenizer,
@@ -74,29 +76,36 @@ def get_processor_types_from_config_class(config_class, allowed_mappings=None):

    We use `tuple` here to include (potentially) both slow & fast tokenizers.
    """
+
+    # To make a uniform return type
+    def _to_tuple(x):
+        if not isinstance(x, collections.abc.Sequence):
+            x = (x,)
+        else:
+            x = tuple(x)
+        return x
+
    if allowed_mappings is None:
-        allowed_mappings = ["processor", "tokenizer", "feature_extractor"]
+        allowed_mappings = ["processor", "tokenizer", "image_processor", "feature_extractor"]

    processor_types = ()

-    # Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers or a feature extractor.
+    # Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers, and/or an image processor or
+    # a feature extractor
    if config_class in PROCESSOR_MAPPING and "processor" in allowed_mappings:
-        processor_types = PROCESSOR_MAPPING[config_class]
-    elif config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
-        processor_types = TOKENIZER_MAPPING[config_class]
-    elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
-        processor_types = FEATURE_EXTRACTOR_MAPPING[config_class]
+        processor_types = _to_tuple(PROCESSOR_MAPPING[config_class])
    else:
-        # Some configurations have no processor at all. For example, generic composite models like
-        # `EncoderDecoderModel` is used for any (compatible) text models. Also, `DecisionTransformer` doesn't
-        # require any processor.
-        pass
+        if config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
+            processor_types = TOKENIZER_MAPPING[config_class]

-    # make a uniform return type
-    if not isinstance(processor_types, collections.abc.Sequence):
-        processor_types = (processor_types,)
-    else:
-        processor_types = tuple(processor_types)
+        if config_class in IMAGE_PROCESSOR_MAPPING and "image_processor" in allowed_mappings:
+            processor_types += _to_tuple(IMAGE_PROCESSOR_MAPPING[config_class])
+        elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
+            processor_types += _to_tuple(FEATURE_EXTRACTOR_MAPPING[config_class])
+
+    # Remark: some configurations have no processor at all. For example, generic composite models like
+    # `EncoderDecoderModel` are used for any (compatible) text models. Also, `DecisionTransformer` doesn't
+    # require any processor.

    # We might get `None` for some tokenizers - remove them here.
    processor_types = tuple(p for p in processor_types if p is not None)
@@ -154,7 +163,7 @@ def get_config_class_from_processor_class(processor_class):
    return new_config_class


-def build_processor(config_class, processor_class):
+def build_processor(config_class, processor_class, allow_no_checkpoint=False):
    """Create a processor for `processor_class`.

    If a processor is not able to be built with the original arguments, this method tries to change the arguments and
@@ -264,6 +273,18 @@ def build_processor(config_class, processor_class):
            if config_class_from_processor_class != config_class:
                processor = build_processor(config_class_from_processor_class, processor_class)

+    # Try to create an image processor or a feature extractor without any checkpoint
+    if (
+        processor is None
+        and allow_no_checkpoint
+        and (issubclass(processor_class, BaseImageProcessor) or issubclass(processor_class, FeatureExtractionMixin))
+    ):
+        try:
+            processor = processor_class()
+        except Exception as e:
+            logger.error(e)
+            pass
+
    # validation
    if processor is not None:
        if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
@@ -458,6 +479,18 @@ def convert_processors(processors, tiny_config, output_folder, result):
        result["warnings"].append(f"Failed to convert feature extractors: {e}")
        feature_extractors = []

+    if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
+        if fast_tokenizer is not None:
+            if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
+                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
+            else:
+                fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
+        if slow_tokenizer is not None:
+            if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
+                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
+            else:
+                slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
+
    processors = [fast_tokenizer, slow_tokenizer] + feature_extractors
    processors = [p for p in processors if p is not None]
    for p in processors:
@@ -491,6 +524,12 @@ def build_model(model_arch, tiny_config, output_dir):
    if os.path.isdir(processor_output_dir):
        shutil.copytree(processor_output_dir, checkpoint_dir, dirs_exist_ok=True)

+    tiny_config = copy.deepcopy(tiny_config)
+
+    if any([model_arch.__name__.endswith(x) for x in ["ForCausalLM", "LMHeadModel"]]):
+        tiny_config.is_encoder_decoder = False
+        tiny_config.is_decoder = True
+
    model = model_arch(config=tiny_config)
    model.save_pretrained(checkpoint_dir)
    model.from_pretrained(checkpoint_dir)
@@ -819,7 +858,7 @@ def build(config_class, models_to_create, output_dir):

    for processor_class in processor_classes:
        try:
-            processor = build_processor(config_class, processor_class)
+            processor = build_processor(config_class, processor_class, allow_no_checkpoint=True)
            if processor is not None:
                result["processor"][processor_class] = processor
        except Exception as e: