Pipeline testing - using tiny models on Hub (#20426)

* rework pipeline tests

* run pipeline tests

* fix

* fix

* fix

* revert the changes in get_test_pipeline() parameter list

* fix expected error message

* skip a test

* clean up

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2023-01-30 10:39:43 +01:00
committed by GitHub
parent a582cfce3c
commit c749bd405e
26 changed files with 286 additions and 274 deletions

View File

@@ -15,6 +15,7 @@
import argparse
import collections.abc
import copy
import importlib
import inspect
import json
@@ -31,6 +32,7 @@ from huggingface_hub import Repository, create_repo, upload_folder
from transformers import (
CONFIG_MAPPING,
FEATURE_EXTRACTOR_MAPPING,
IMAGE_PROCESSOR_MAPPING,
PROCESSOR_MAPPING,
TOKENIZER_MAPPING,
AutoTokenizer,
@@ -74,29 +76,36 @@ def get_processor_types_from_config_class(config_class, allowed_mappings=None):
We use `tuple` here to include (potentially) both slow & fast tokenizers.
"""
# Helper that converts a value to a tuple, so the return type is uniform
def _to_tuple(x):
if not isinstance(x, collections.abc.Sequence):
x = (x,)
else:
x = tuple(x)
return x
if allowed_mappings is None:
allowed_mappings = ["processor", "tokenizer", "feature_extractor"]
allowed_mappings = ["processor", "tokenizer", "image_processor", "feature_extractor"]
processor_types = ()
# Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers or a feature extractor.
# Check first if a model has `ProcessorMixin`. Otherwise, check if it has tokenizers, and/or an image processor or
# a feature extractor
if config_class in PROCESSOR_MAPPING and "processor" in allowed_mappings:
processor_types = PROCESSOR_MAPPING[config_class]
elif config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
processor_types = TOKENIZER_MAPPING[config_class]
elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
processor_types = FEATURE_EXTRACTOR_MAPPING[config_class]
processor_types = _to_tuple(PROCESSOR_MAPPING[config_class])
else:
# Some configurations have no processor at all. For example, generic composite models like
# `EncoderDecoderModel` are used for any (compatible) text models. Also, `DecisionTransformer` doesn't
# require any processor.
pass
if config_class in TOKENIZER_MAPPING and "tokenizer" in allowed_mappings:
processor_types = TOKENIZER_MAPPING[config_class]
# make a uniform return type
if not isinstance(processor_types, collections.abc.Sequence):
processor_types = (processor_types,)
else:
processor_types = tuple(processor_types)
if config_class in IMAGE_PROCESSOR_MAPPING and "image_processor" in allowed_mappings:
processor_types += _to_tuple(IMAGE_PROCESSOR_MAPPING[config_class])
elif config_class in FEATURE_EXTRACTOR_MAPPING and "feature_extractor" in allowed_mappings:
processor_types += _to_tuple(FEATURE_EXTRACTOR_MAPPING[config_class])
# Remark: some configurations have no processor at all. For example, generic composite models like
# `EncoderDecoderModel` are used for any (compatible) text models. Also, `DecisionTransformer` doesn't
# require any processor.
# We might get `None` for some tokenizers - remove them here.
processor_types = tuple(p for p in processor_types if p is not None)
@@ -154,7 +163,7 @@ def get_config_class_from_processor_class(processor_class):
return new_config_class
def build_processor(config_class, processor_class):
def build_processor(config_class, processor_class, allow_no_checkpoint=False):
"""Create a processor for `processor_class`.
If a processor is not able to be built with the original arguments, this method tries to change the arguments and
@@ -264,6 +273,18 @@ def build_processor(config_class, processor_class):
if config_class_from_processor_class != config_class:
processor = build_processor(config_class_from_processor_class, processor_class)
# Try to create an image processor or a feature extractor without any checkpoint
if (
processor is None
and allow_no_checkpoint
and (issubclass(processor_class, BaseImageProcessor) or issubclass(processor_class, FeatureExtractionMixin))
):
try:
processor = processor_class()
except Exception as e:
logger.error(e)
pass
# validation
if processor is not None:
if not (isinstance(processor, processor_class) or processor_class.__name__.startswith("Auto")):
@@ -458,6 +479,18 @@ def convert_processors(processors, tiny_config, output_folder, result):
result["warnings"].append(f"Failed to convert feature extractors: {e}")
feature_extractors = []
if hasattr(tiny_config, "max_position_embeddings") and tiny_config.max_position_embeddings > 0:
if fast_tokenizer is not None:
if fast_tokenizer.__class__.__name__ in ["RobertaTokenizerFast", "XLMRobertaTokenizerFast"]:
fast_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
else:
fast_tokenizer.model_max_length = tiny_config.max_position_embeddings
if slow_tokenizer is not None:
if slow_tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
slow_tokenizer.model_max_length = tiny_config.max_position_embeddings - 2
else:
slow_tokenizer.model_max_length = tiny_config.max_position_embeddings
processors = [fast_tokenizer, slow_tokenizer] + feature_extractors
processors = [p for p in processors if p is not None]
for p in processors:
@@ -491,6 +524,12 @@ def build_model(model_arch, tiny_config, output_dir):
if os.path.isdir(processor_output_dir):
shutil.copytree(processor_output_dir, checkpoint_dir, dirs_exist_ok=True)
tiny_config = copy.deepcopy(tiny_config)
if any([model_arch.__name__.endswith(x) for x in ["ForCausalLM", "LMHeadModel"]]):
tiny_config.is_encoder_decoder = False
tiny_config.is_decoder = True
model = model_arch(config=tiny_config)
model.save_pretrained(checkpoint_dir)
model.from_pretrained(checkpoint_dir)
@@ -819,7 +858,7 @@ def build(config_class, models_to_create, output_dir):
for processor_class in processor_classes:
try:
processor = build_processor(config_class, processor_class)
processor = build_processor(config_class, processor_class, allow_no_checkpoint=True)
if processor is not None:
result["processor"][processor_class] = processor
except Exception as e: